def setup_optimizers(self):
    params = self.params

    self.causal_conv_optimizers = []
    for layer in self.causal_conv_layers:
        opt = optimizers.NesterovAG(lr=params.learning_rate, momentum=params.gradient_momentum)
        opt.setup(layer)
        opt.add_hook(optimizer.WeightDecay(params.weight_decay))
        opt.add_hook(GradientClipping(params.gradient_clipping))
        self.causal_conv_optimizers.append(opt)

    self.residual_conv_optimizers = []
    for layer in self.residual_conv_layers:
        opt = optimizers.NesterovAG(lr=params.learning_rate, momentum=params.gradient_momentum)
        opt.setup(layer)
        opt.add_hook(optimizer.WeightDecay(params.weight_decay))
        opt.add_hook(GradientClipping(params.gradient_clipping))
        self.residual_conv_optimizers.append(opt)

    self.softmax_conv_optimizers = []
    for layer in self.softmax_conv_layers:
        opt = optimizers.NesterovAG(lr=params.learning_rate, momentum=params.gradient_momentum)
        opt.setup(layer)
        opt.add_hook(optimizer.WeightDecay(params.weight_decay))
        opt.add_hook(GradientClipping(params.gradient_clipping))
        self.softmax_conv_optimizers.append(opt)
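# The three loops in setup_optimizers above are identical apart from the layer
# list they walk; a minimal, hedged sketch of a factory helper that would
# remove the duplication (make_layer_optimizers is a hypothetical name, and it
# assumes the same imports and hooks as the snippet above):
def make_layer_optimizers(layers, params):
    opts = []
    for layer in layers:
        opt = optimizers.NesterovAG(lr=params.learning_rate,
                                    momentum=params.gradient_momentum)
        opt.setup(layer)  # bind the optimizer to this layer's parameters
        opt.add_hook(optimizer.WeightDecay(params.weight_decay))
        opt.add_hook(GradientClipping(params.gradient_clipping))
        opts.append(opt)
    return opts

# e.g. self.causal_conv_optimizers = make_layer_optimizers(self.causal_conv_layers, params)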
def optimizer(opt_str):
    """Infer the optimizer from the input string."""
    if opt_str.lower() == 'adam':
        opt = O.Adam(amsgrad=True)
    elif opt_str.lower() == 'ada_d':
        opt = O.AdaDelta()
    elif opt_str.lower() == 'ada_g':
        opt = O.AdaGrad()
    elif opt_str.lower() == 'm_sgd':
        opt = O.MomentumSGD()
    elif opt_str.lower() == 'n_ag':
        opt = O.NesterovAG()
    elif opt_str.lower() == 'rmsp':
        opt = O.RMSprop()
    elif opt_str.lower() == 'rmsp_g':
        opt = O.RMSpropGraves()
    elif opt_str.lower() == 'sgd':
        opt = O.SGD()
    elif opt_str.lower() == 'smorms':
        opt = O.SMORMS3()
    else:
        # unknown string: fall back to Adam and warn
        opt = O.Adam(amsgrad=True)
        logger.warning('{}->{}'.format(opt_str, opt.__doc__.split('.')[0]))

    logger.debug('Optimizer: {}'.format(opt.__doc__.split('.')[0]))
    return opt
def optimizer(opt_str):
    """Infer the optimizer from the input string."""
    if opt_str.lower() == 'adam':
        opt = O.Adam(amsgrad=True)
    elif opt_str.lower() == 'ada_d':
        opt = O.AdaDelta()
    elif opt_str.lower() == 'ada_g':
        opt = O.AdaGrad()
    elif opt_str.lower() == 'm_sgd':
        opt = O.MomentumSGD()
    elif opt_str.lower() == 'n_ag':
        opt = O.NesterovAG()
    elif opt_str.lower() == 'rmsp':
        opt = O.RMSprop()
    elif opt_str.lower() == 'rmsp_g':
        opt = O.RMSpropGraves()
    elif opt_str.lower() == 'sgd':
        opt = O.SGD()
    elif opt_str.lower() == 'smorms':
        opt = O.SMORMS3()
    else:
        # unknown string: fall back to Adam and warn
        opt = O.Adam(amsgrad=True)
        print('\n[Warning] {0}\n\t{1}->{2}\n'.format(
            fileFuncLine(), opt_str, opt.__doc__.split('.')[0]))

    print('Optimizer:', opt.__doc__.split('.')[0])
    return opt
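# A minimal usage sketch for the dispatchers above, assuming `O` is
# chainer.optimizers as in the snippets (the Linear model is a stand-in,
# not part of the original code):
import chainer.links as L

model = L.Linear(10, 2)   # any chainer.Link works here
opt = optimizer('n_ag')   # returns O.NesterovAG()
opt.setup(model)          # attach the optimizer to the model's parameters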
def which_is_best_optimizer(k=10, model=CNN()):
    k_fold_validation(k, copy.deepcopy(model), optimizer=optimizers.Adam(), tag='Adam')
    k_fold_validation(k, copy.deepcopy(model), optimizer=optimizers.SGD(), tag='SGD')
    k_fold_validation(k, copy.deepcopy(model), optimizer=optimizers.RMSpropGraves(), tag='RMSpropGraves')
    # k_fold_validation(k, copy.deepcopy(model), optimizer=optimizers.RMSprop(), tag='RMSprop')
    k_fold_validation(k, copy.deepcopy(model), optimizer=optimizers.AdaDelta(), tag='AdaDelta')
    k_fold_validation(k, copy.deepcopy(model), optimizer=optimizers.AdaGrad(), tag='AdaGrad')
    k_fold_validation(k, copy.deepcopy(model), optimizer=optimizers.MomentumSGD(), tag='MomentumSGD')
    k_fold_validation(k, copy.deepcopy(model), optimizer=optimizers.NesterovAG(), tag='NesterovAG')
def get_opt(args):
    if args.opt_model == "SGD":
        alpha0 = 0.01 if args.alpha0 == 0 else args.alpha0
        return optimizers.SGD(lr=alpha0)
    if args.opt_model == "AdaGrad":
        alpha0 = 0.01 if args.alpha0 == 0 else args.alpha0
        return optimizers.AdaGrad(lr=alpha0)
    if args.opt_model == "AdaDelta":
        alpha0 = 0.95 if args.alpha0 == 0 else args.alpha0
        alpha1 = 1e-06 if args.alpha1 == 0 else args.alpha1
        return optimizers.AdaDelta(rho=alpha0, eps=alpha1)
    if args.opt_model == "Momentum":
        alpha0 = 0.01 if args.alpha0 == 0 else args.alpha0
        alpha1 = 0.9 if args.alpha1 == 0 else args.alpha1
        return optimizers.MomentumSGD(lr=alpha0, momentum=alpha1)
    if args.opt_model == "NAG":
        alpha0 = 0.01 if args.alpha0 == 0 else args.alpha0
        alpha1 = 0.9 if args.alpha1 == 0 else args.alpha1
        return optimizers.NesterovAG(lr=alpha0, momentum=alpha1)
    if args.opt_model == "RMS":
        return optimizers.RMSpropGraves()
    if args.opt_model == "SM":
        return optimizers.SMORMS3()
    if args.opt_model == "Adam":  # default case
        alpha0 = 0.001 if args.alpha0 == 0 else args.alpha0
        alpha1 = 0.9 if args.alpha1 == 0 else args.alpha1
        alpha2 = 0.999 if args.alpha2 == 0 else args.alpha2
        alpha3 = 1e-08 if args.alpha3 == 0 else args.alpha3
        return optimizers.Adam(alpha=alpha0, beta1=alpha1, beta2=alpha2, eps=alpha3)
    print('no such optimization method', args.opt_model)
    sys.exit(1)
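# Hedged usage sketch: get_opt reads everything from an argparse-style
# namespace, so a SimpleNamespace stand-in is enough to exercise it; the
# alpha* fields set to 0 select the defaults shown above:
from types import SimpleNamespace

args = SimpleNamespace(opt_model="NAG", alpha0=0, alpha1=0, alpha2=0, alpha3=0)
opt = get_opt(args)  # NesterovAG(lr=0.01, momentum=0.9)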
def get_optimizer(name, lr, momentum):
    if name == "sgd":
        return optimizers.SGD(lr=lr)
    if name == "msgd":
        return optimizers.MomentumSGD(lr=lr, momentum=momentum)
    if name == "nesterov":
        return optimizers.NesterovAG(lr=lr, momentum=momentum)
    if name == "adam":
        # Adam has no lr/momentum arguments; alpha and beta1 play those roles
        return optimizers.Adam(alpha=lr, beta1=momentum)
    raise NotImplementedError()
def main():
    opt = opts.parse()
    model = net.ConvNet(opt.n_classes, opt.BC, opt.nobias, opt.dropout_ratio)
    if opt.gpu > -1:
        chainer.cuda.get_device_from_id(opt.gpu).use()
        model.to_gpu()

    optimizer = optimizers.NesterovAG(lr=opt.LR, momentum=opt.momentum)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(opt.weight_decay))

    train_iter, val_iter = dataset.setup(opt)
    updater = training.StandardUpdater(train_iter, optimizer, device=opt.gpu)

    # Trainer
    trainer = training.Trainer(updater, (opt.n_epochs, 'epoch'), opt.save)
    trainer.extend(extensions.ExponentialShift('lr', 0.1, opt.LR),
                   trigger=ManualScheduleTrigger(opt.schedule, 'epoch'))
    trainer.extend(extensions.Evaluator(val_iter, model, device=opt.gpu),
                   trigger=(1, 'epoch'))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(filename='min_loss'),
                   trigger=MinValueTrigger(key='validation/main/loss',
                                           trigger=(5, 'epoch')))
    trainer.extend(extensions.snapshot(filename='max_accuracy'),
                   trigger=MaxValueTrigger(key='validation/main/accuracy',
                                           trigger=(5, 'epoch')))
    trainer.extend(extensions.snapshot_object(model, 'min_loss_model'),
                   trigger=MinValueTrigger(key='validation/main/loss',
                                           trigger=(5, 'epoch')))
    trainer.extend(extensions.snapshot_object(model, 'max_accuracy_model'),
                   trigger=MaxValueTrigger(key='validation/main/accuracy',
                                           trigger=(5, 'epoch')))
    trainer.extend(extensions.observe_lr())
    trainer.extend(extensions.LogReport())
    if extensions.PlotReport.available():
        trainer.extend(extensions.PlotReport(
            ['main/loss', 'validation/main/loss'], 'epoch', file_name='loss.png'))
        trainer.extend(extensions.PlotReport(
            ['main/accuracy', 'validation/main/accuracy'], 'epoch', file_name='accuracy.png'))
        trainer.extend(extensions.PlotReport(
            ['lr'], 'epoch', file_name='learning_rate.png'))
    trainer.extend(extensions.PrintReport(
        ['elapsed_time', 'epoch', 'iteration', 'lr', 'main/loss', 'main/accuracy',
         'validation/main/loss', 'validation/main/accuracy']))
    trainer.extend(extensions.ProgressBar(update_interval=25))

    if opt.resume and os.path.exists(opt.resume):
        chainer.serializers.load_npz(opt.resume, trainer)

    # Run the training
    try:
        trainer.run()
    except Exception:
        import shutil
        import traceback
        print('\nerror message')
        print(traceback.format_exc())
        shutil.rmtree(opt.save)
def get_optimizer(self, name, lr, momentum=0.9):
    if name.lower() == "adam":
        return optimizers.Adam(alpha=lr, beta1=momentum)
    if name.lower() == "smorms3":
        return optimizers.SMORMS3(lr=lr)
    if name.lower() == "adagrad":
        return optimizers.AdaGrad(lr=lr)
    if name.lower() == "adadelta":
        return optimizers.AdaDelta(rho=momentum)
    if name.lower() in ("nesterov", "nesterovag"):
        return optimizers.NesterovAG(lr=lr, momentum=momentum)
    if name.lower() == "rmsprop":
        return optimizers.RMSprop(lr=lr, alpha=momentum)
    if name.lower() == "momentumsgd":
        return optimizers.MomentumSGD(lr=lr, momentum=momentum)
    if name.lower() == "sgd":
        return optimizers.SGD(lr=lr)
    # fail loudly instead of implicitly returning None
    raise ValueError("unknown optimizer name: %s" % name)
def select_optimizer(opt_name, learning_rate):
    if opt_name == "Adam":
        return optimizers.Adam(alpha=learning_rate)
    elif opt_name == "SGD":
        return optimizers.SGD(lr=learning_rate)
    elif opt_name == "RMSpropGraves":
        return optimizers.RMSpropGraves(lr=learning_rate)
    elif opt_name == "RMSprop":
        return optimizers.RMSprop(lr=learning_rate)
    elif opt_name == "AdaDelta":
        return optimizers.AdaDelta()
    elif opt_name == "AdaGrad":
        return optimizers.AdaGrad(lr=learning_rate)
    elif opt_name == "MomentumSGD":
        return optimizers.MomentumSGD(lr=learning_rate)
    elif opt_name == "NesterovAG":
        return optimizers.NesterovAG(lr=learning_rate)
    else:
        print('Please select a valid optimizer.')
        exit()
def __init__(self):
    self.__start_time = time.time()
    self.__always_attn = get_arg('always_attn')
    self.__fold = 10
    self.__batch_size = 32
    self.__n_epoch = 10
    self.__label_num = 2
    self.__mem_units = get_arg('mem_units')
    self.__data_mode = get_arg('data')
    self.__is_toy = get_arg('toy')
    self.__dict = get_arg('pol_dict')
    self.__logdir = get_arg('logdir')
    self.__is_regression = get_arg('regression')
    self.__attention = get_arg('attention')
    self.__lr = get_arg('lr')
    self.__l2 = get_arg('l2')
    self.__clip_grad = get_arg('clip_grad')
    self.__composition = get_arg('composition')
    self.__n_units = 200
    self.__render_graph = get_arg('not_render')
    self.__dropout = get_arg('dropout')
    if self.__data_mode == 'ntcire' or self.__data_mode.startswith('sst'):
        self.__n_units = 300
    self.__not_embed = get_arg('not_embed')
    self.__attention_target = get_arg('attention_target')
    self.__forget_bias = get_arg('forget_bias')
    self.__only_attn = get_arg('only_attn')

    # optimizer
    self.__opt_name = get_arg('optimizer')
    if self.__opt_name == 'SGD':
        self.__opt = lambda: optimizers.SGD(lr=self.__lr)
    elif self.__opt_name == 'AdaDelta':
        self.__opt = lambda: optimizers.AdaDelta()
    elif self.__opt_name == 'Adam':
        self.__opt = lambda: optimizers.Adam(alpha=self.__lr)
    elif self.__opt_name == 'NesterovAg':
        self.__opt = lambda: optimizers.NesterovAG(lr=self.__lr)
    elif self.__opt_name == 'AdaGrad':
        self.__opt = lambda: optimizers.AdaGrad(lr=self.__lr)

    # data
    data_dir = utils.get_data_path()
    mecab_embedf = data_dir + '/vector/word2vec/wikiDump_mecab_size200_cbow.w2vModel'
    kytea_embedf = data_dir + '/vector/word2vec/wikiDump_kytea_size200_skipgram.w2vModel'
    en_embedf = data_dir + '/vector/glove/glove.840B.300d.txt'
    if self.__data_mode == 'ntcirj_con':
        data = data_dir + '/ntcirj/ckylark/data.pkl.bz2'
        self.__embedf = kytea_embedf
    if self.__data_mode == 'ntcirj_dep':
        data = data_dir + '/ntcirj/cabocha/data.pkl.bz2'
        self.__embedf = mecab_embedf
    elif self.__data_mode == 'ntcire':
        self.__embedf = en_embedf
        data = data_dir + '/ntcire/ckylark/data.pkl.bz2'
    elif self.__data_mode == 'tsukuba':
        self.__embedf = kytea_embedf
        data = data_dir + '/tsukuba/ckylark/data.pkl.bz2'
    elif self.__data_mode == 'sst_all':
        self.__embedf = en_embedf
        self.__label_num = 5
        data = data_dir + '/sst_all/data.pkl.bz2'
    elif self.__data_mode == 'sst_cut':
        self.__embedf = en_embedf
        self.__label_num = 5
        data = data_dir + '/sst_cut/data.pkl.bz2'

    data = utils.read_pkl_bz2(data)
    if self.__is_toy:
        data = data['toy']
        self.__n_epoch = 3
    if self.__dict == 'pn':
        data = data['poldict']
    elif self.__dict == 'pnn':
        data = data['poldict_neutral']
    self.__data = data

    self.mk_logfiles()
    self.print_params()
def do_train(config_training):
    src_indexer, tgt_indexer = load_voc_and_update_training_config(config_training)

    save_prefix = config_training.training_management.save_prefix

    output_files_dict = {}
    output_files_dict["train_config"] = save_prefix + ".train.config"
    output_files_dict["model_ckpt"] = save_prefix + ".model." + "ckpt" + ".npz"
    output_files_dict["model_final"] = save_prefix + ".model." + "final" + ".npz"
    output_files_dict["model_best"] = save_prefix + ".model." + "best" + ".npz"
    output_files_dict["model_best_loss"] = save_prefix + ".model." + "best_loss" + ".npz"
    # output_files_dict["model_ckpt_config"] = save_prefix + ".model." + "ckpt" + ".config"
    # output_files_dict["model_final_config"] = save_prefix + ".model." + "final" + ".config"
    # output_files_dict["model_best_config"] = save_prefix + ".model." + "best" + ".config"
    # output_files_dict["model_best_loss_config"] = save_prefix + ".model." + "best_loss" + ".config"
    output_files_dict["test_translation_output"] = save_prefix + ".test.out"
    output_files_dict["test_src_output"] = save_prefix + ".test.src.out"
    output_files_dict["dev_translation_output"] = save_prefix + ".dev.out"
    output_files_dict["dev_src_output"] = save_prefix + ".dev.src.out"
    output_files_dict["valid_translation_output"] = save_prefix + ".valid.out"
    output_files_dict["valid_src_output"] = save_prefix + ".valid.src.out"
    output_files_dict["sqlite_db"] = save_prefix + ".result.sqlite"
    output_files_dict["optimizer_ckpt"] = save_prefix + ".optimizer." + "ckpt" + ".npz"
    output_files_dict["optimizer_final"] = save_prefix + ".optimizer." + "final" + ".npz"

    save_prefix_dir, save_prefix_fn = os.path.split(save_prefix)
    ensure_path(save_prefix_dir)

    already_existing_files = []
    for key_info, filename in output_files_dict.iteritems():  # , valid_data_fn]:
        if os.path.exists(filename):
            already_existing_files.append(filename)
    if len(already_existing_files) > 0:
        print "Warning: existing files are going to be replaced / updated: ", already_existing_files
        if not config_training.training_management.force_overwrite:
            raw_input("Press Enter to Continue")

    save_train_config_fn = output_files_dict["train_config"]
    log.info("Saving training config to %s" % save_train_config_fn)
    config_training.save_to(save_train_config_fn)
    # json.dump(config_training, open(save_train_config_fn, "w"), indent=2, separators=(',', ': '))

    Vi = len(src_indexer)  # + UNK
    Vo = len(tgt_indexer)  # + UNK

    eos_idx = Vo

    data_fn = config_training.data.data_fn

    log.info("loading training data from %s" % data_fn)
    training_data_all = json.load(gzip.open(data_fn, "rb"))

    training_data = training_data_all["train"]
    log.info("loaded %i sentences as training data" % len(training_data))

    if "test" in training_data_all:
        test_data = training_data_all["test"]
        log.info("Found test data: %i sentences" % len(test_data))
    else:
        test_data = None
        log.info("No test data found")

    if "dev" in training_data_all:
        dev_data = training_data_all["dev"]
        log.info("Found dev data: %i sentences" % len(dev_data))
    else:
        dev_data = None
        log.info("No dev data found")

    if "valid" in training_data_all:
        valid_data = training_data_all["valid"]
        log.info("Found valid data: %i sentences" % len(valid_data))
    else:
        valid_data = None
        log.info("No valid data found")

    max_src_tgt_length = config_training.training_management.max_src_tgt_length
    if max_src_tgt_length is not None:
        log.info("filtering sentences of length larger than %i" % (max_src_tgt_length))
        filtered_training_data = []
        nb_filtered = 0
        for src, tgt in training_data:
            if len(src) <= max_src_tgt_length and len(tgt) <= max_src_tgt_length:
                filtered_training_data.append((src, tgt))
            else:
                nb_filtered += 1
        log.info("filtered %i sentences of length larger than %i" %
                 (nb_filtered, max_src_tgt_length))
        training_data = filtered_training_data

    if not config_training.training.no_shuffle_of_training_data:
        log.info("shuffling")
        import random
        random.shuffle(training_data)
        log.info("done")

    encdec, _, _, _ = create_encdec_and_indexers_from_config_dict(
        config_training,
        src_indexer=src_indexer,
        tgt_indexer=tgt_indexer,
        load_config_model="if_exists" if config_training.training_management.resume else "no")
    # create_encdec_from_config_dict(config_training.model, src_indexer, tgt_indexer,
    #                                load_config_model="if_exists" if config_training.training_management.resume else "no")
    # if config_training.training_management.resume:
    #     if "model_parameters" not in config_training:
    #         log.error("cannot find model parameters in config file")
    #     if config_training.model_parameters.type == "model":
    #         model_filename = config_training.model_parameters.filename
    #         log.info("resuming from model parameters %s" % model_filename)
    #         serializers.load_npz(model_filename, encdec)

    if config_training.training_management.load_model is not None:
        log.info("loading model parameters from %s",
                 config_training.training_management.load_model)
        serializers.load_npz(config_training.training_management.load_model, encdec)

    gpu = config_training.training_management.gpu
    if gpu is not None:
        encdec = encdec.to_gpu(gpu)

    if config_training.training.optimizer == "adadelta":
        optimizer = optimizers.AdaDelta()
    elif config_training.training.optimizer == "adam":
        optimizer = optimizers.Adam()
    elif config_training.training.optimizer == "adagrad":
        optimizer = optimizers.AdaGrad(lr=config_training.training.learning_rate)
    elif config_training.training.optimizer == "sgd":
        optimizer = optimizers.SGD(lr=config_training.training.learning_rate)
    elif config_training.training.optimizer == "momentum":
        optimizer = optimizers.MomentumSGD(lr=config_training.training.learning_rate,
                                           momentum=config_training.training.momentum)
    elif config_training.training.optimizer == "nesterov":
        optimizer = optimizers.NesterovAG(lr=config_training.training.learning_rate,
                                          momentum=config_training.training.momentum)
    elif config_training.training.optimizer == "rmsprop":
        optimizer = optimizers.RMSprop(lr=config_training.training.learning_rate)
    elif config_training.training.optimizer == "rmspropgraves":
        optimizer = optimizers.RMSpropGraves(lr=config_training.training.learning_rate,
                                             momentum=config_training.training.momentum)
    else:
        raise NotImplementedError()

    with cuda.get_device(gpu):
        optimizer.setup(encdec)

    if config_training.training.l2_gradient_clipping is not None and config_training.training.l2_gradient_clipping > 0:
        optimizer.add_hook(chainer.optimizer.GradientClipping(
            config_training.training.l2_gradient_clipping))
    if config_training.training.hard_gradient_clipping is not None and config_training.training.hard_gradient_clipping > 0:
        optimizer.add_hook(chainer.optimizer.GradientHardClipping(
            *config_training.training.hard_gradient_clipping))
    if config_training.training.weight_decay is not None:
        optimizer.add_hook(chainer.optimizer.WeightDecay(
            config_training.training.weight_decay))

    if config_training.training_management.load_optimizer_state is not None:
        with cuda.get_device(gpu):
            log.info("loading optimizer parameters from %s",
                     config_training.training_management.load_optimizer_state)
            serializers.load_npz(config_training.training_management.load_optimizer_state,
                                 optimizer)

    if config_training.training_management.timer_hook:
        timer_hook = profiling_tools.MyTimerHook
    else:
        import contextlib

        @contextlib.contextmanager
        def timer_hook():
            yield

    import training_chainer
    with cuda.get_device(gpu):
        with timer_hook() as timer_infos:
            if config_training.training_management.max_nb_iters is not None:
                stop_trigger = (config_training.training_management.max_nb_iters,
                                "iteration")
                if config_training.training_management.max_nb_epochs is not None:
                    log.warn("max_nb_iters and max_nb_epochs both specified. "
                             "Only max_nb_iters will be considered.")
            elif config_training.training_management.max_nb_epochs is not None:
                stop_trigger = (config_training.training_management.max_nb_epochs,
                                "epoch")
            else:
                stop_trigger = None

            training_chainer.train_on_data_chainer(encdec, optimizer, training_data,
                                                   output_files_dict,
                                                   src_indexer, tgt_indexer,
                                                   eos_idx=eos_idx,
                                                   config_training=config_training,
                                                   stop_trigger=stop_trigger,
                                                   test_data=test_data,
                                                   dev_data=dev_data,
                                                   valid_data=valid_data)
def set_optimizer(self, optimizer, learning_rate_init, weight_decay=0,
                  clip_grad_norm=5, lr_schedule=None, factor=None,
                  patience_epoch=None):
    """Set the optimizer and add hooks.

    Args:
        optimizer (string): sgd or adam or adadelta or adagrad or rmsprop
        learning_rate_init (float): An initial learning rate
        weight_decay (float, optional): L2 penalty
        clip_grad_norm (float): threshold for gradient-norm clipping
        lr_schedule: not used here
        factor: not used here
        patience_epoch: not used here
    Returns:
        None
    """
    optimizer = optimizer.lower()
    if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s." %
            (", ".join(OPTIMIZER_CLS_NAMES), optimizer))

    if optimizer == 'adadelta':
        self.optimizer = optimizers.AdaDelta(rho=0.95, eps=1e-6)
        # TODO: check learning rate
    elif optimizer == 'adagrad':
        self.optimizer = optimizers.AdaGrad(lr=learning_rate_init, eps=1e-8)
    elif optimizer == 'adam':
        self.optimizer = optimizers.Adam(alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8)
        # TODO: check learning rate
    elif optimizer == 'sgd':
        self.optimizer = optimizers.MomentumSGD(lr=learning_rate_init, momentum=0.9)
    elif optimizer == 'nesterov':
        self.optimizer = optimizers.NesterovAG(lr=learning_rate_init, momentum=0.9)
    elif optimizer == 'rmsprop':
        self.optimizer = optimizers.RMSprop(lr=learning_rate_init, alpha=0.99, eps=1e-8)
    elif optimizer == 'rmspropgraves':
        self.optimizer = optimizers.RMSpropGraves(lr=learning_rate_init, alpha=0.95,
                                                  momentum=0.9, eps=0.0001)
    else:
        raise NotImplementedError

    self.optimizer.setup(self)

    # Add hooks
    self.optimizer.add_hook(chainer.optimizer.GradientClipping(clip_grad_norm))
    self.optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
    # self.optimizer.add_hook(chainer.optimizer.GradientNoise(eta=0.01))

    return None
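# Hedged usage sketch: set_optimizer is meant to be called on the model
# itself, since it runs self.optimizer.setup(self); `model` here is a
# hypothetical instance of the chain that mixes in this method.
model.set_optimizer('nesterov', learning_rate_init=0.1,
                    weight_decay=1e-6, clip_grad_norm=5)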
def create(self):
    # NesterovAG's first positional argument is the learning rate (lr=0.1)
    return optimizers.NesterovAG(0.1)
# init RAM model and set optimizer
from ram import RAM

model = RAM(g_size=g_size, n_steps=n_steps, n_scales=n_scales,
            var=variance, use_lstm=args.lstm)

if not args.lstm:
    # initialize the core recurrent weight matrix to identity
    data = model.core_hh.W.data
    data[:] = np.identity(data.shape[0], dtype=np.float32)

lr_base = 1e-2
optimizer = optimizers.NesterovAG(lr=lr_base)
optimizer.use_cleargrads()
optimizer.setup(model)

if args.model is not None:
    print('load model from {}'.format(args.model))
    serializers.load_hdf5(args.model, model)

if args.resume is not None:
    print('load optimizer state from {}'.format(args.resume))
    serializers.load_hdf5(args.resume, optimizer)

# GPU/CPU
gpuid = args.gpuid
if gpuid >= 0:
    cuda.get_device(gpuid).use()
def run_nn_vae(q, optimizer_nm, train_x, train_real, train_y, test_x, test_y,
               cross_val, nn_n_hidden, vae_n_hidden, n_z, n_batch, nn_n_epochs,
               vae_n_epochs, n_epochs_tuning, activation, grad_clip, noise_nm,
               gpu=-1):
    # np.random.seed(123)  # fix the random seed
    n_x = train_x.shape[1]
    n_real = train_real.shape[1]
    n_y = train_y.shape[1]
    nn_n_layers = len(nn_n_hidden)
    vae_n_hidden_recog = vae_n_hidden
    vae_n_hidden_gen = vae_n_hidden[::-1]
    vae_n_layers_recog = len(vae_n_hidden_recog)
    vae_n_layers_gen = len(vae_n_hidden_gen)

    # NN pre_train
    layers = {}

    # Recognition model.
    nn_layer_sizes = [(n_x, nn_n_hidden[0])]
    if nn_n_layers > 1:
        nn_layer_sizes += zip(nn_n_hidden[:-1], nn_n_hidden[1:])
    nn_layer_sizes += [(nn_n_hidden[-1], n_real)]
    for i, (n_incoming, n_outgoing) in enumerate(nn_layer_sizes):
        layers['nn_layer_%i' % i] = F.Linear(n_incoming, n_outgoing)

    # VAE pre_train
    # Recognition model.
    vae_rec_layer_sizes = [(n_real, vae_n_hidden_recog[0])]
    if vae_n_layers_recog > 1:
        vae_rec_layer_sizes += zip(vae_n_hidden_recog[:-1], vae_n_hidden_recog[1:])
    vae_rec_layer_sizes += [(vae_n_hidden_recog[-1], n_z)]
    for i, (n_incoming, n_outgoing) in enumerate(vae_rec_layer_sizes):
        layers['vae_recog_%i' % i] = F.Linear(n_incoming, n_outgoing)
    layers['log_sigma'] = F.Linear(vae_n_hidden_recog[-1], n_z)

    # Generating model.
    vae_gen_layer_sizes = [(n_z, vae_n_hidden_gen[0])]
    if vae_n_layers_recog > 1:
        vae_gen_layer_sizes += zip(vae_n_hidden_gen[:-1], vae_n_hidden_gen[1:])
    vae_gen_layer_sizes += [(vae_n_hidden_gen[-1], n_real)]
    for i, (n_incoming, n_outgoing) in enumerate(vae_gen_layer_sizes):
        layers['vae_gen_%i' % i] = F.Linear(n_incoming, n_outgoing)
    layers['output'] = F.Linear(n_z, n_y)

    model = NN_VAE(**layers)

    if gpu >= 0:
        cuda.init(gpu)
        model.to_gpu()

    # select the optimizer by name (e.g. "Adam")
    optimizers_dict = {
        "Adam": optimizers.Adam(),
        "AdaDelta": optimizers.AdaDelta(),
        "AdaGrad": optimizers.AdaGrad(),
        "MomentumSGD": optimizers.MomentumSGD(),
        "NesterovAG": optimizers.NesterovAG(),
        "RMSprop": optimizers.RMSprop(),
        "SGD": optimizers.SGD()
    }
    optimizer = optimizers_dict[optimizer_nm]
    optimizer.setup(model.collect_parameters())

    # NN pre-training
    total_nn_losses = []
    if cross_val >= 0:
        print('{}s pre-train start ...'.format(cross_val))
    for epoch in xrange(1, nn_n_epochs + 1):
        t1 = time.time()
        # np.random.seed(123)
        indexes = np.random.permutation(train_x.shape[0])
        nn_total_loss = 0.0
        nn_out_list = np.zeros(train_real.shape)
        noisy_train_x = np.array(noisy(noise_nm, train_x), dtype=np.float32)
        for i in xrange(0, train_x.shape[0], n_batch):
            noisy_x_batch = noisy_train_x[indexes[i:i + n_batch]]
            real_batch = train_real[indexes[i:i + n_batch]]
            if gpu >= 0:
                noisy_x_batch = cuda.to_gpu(noisy_x_batch)
            optimizer.zero_grads()
            loss, nn_out = model.nn_forward(noisy_x_batch, real_batch, nn_n_layers,
                                            nonlinear=activation, gpu=-1, train=True)
            nn_total_loss += float(loss.data) * len(noisy_x_batch)
            loss.backward()
            optimizer.clip_grads(grad_clip)
            optimizer.update()
            nn_out_list[indexes[i:i + n_batch]] = nn_out.data
        total_nn_losses.append(nn_total_loss / train_x.shape[0])

    # VAE pre-training
    total_vae_losses = []
    if cross_val >= 0:
        print('{}s VAE pre-train start ...'.format(cross_val))
    nn_out_list = np.array(nn_out_list, dtype=np.float32)
    noisy_nn_out_list = np.array(noisy(noise_nm, nn_out_list), dtype=np.float32)
    for epoch in xrange(1, vae_n_epochs + 1):
        # np.random.seed(123)
        indexes = np.random.permutation(train_x.shape[0])
        total_loss = 0.0
        noisy_nn_out_list = np.array(noisy(noise_nm, nn_out_list), dtype=np.float32)
        for i in xrange(0, train_x.shape[0], n_batch):
            noisy_nn_out_list_batch = noisy_nn_out_list[indexes[i:i + n_batch]]
            nn_out_list_batch = nn_out_list[indexes[i:i + n_batch]]
            real_batch = train_real[indexes[i:i + n_batch]]
            if gpu >= 0:
                noisy_nn_out_list_batch = cuda.to_gpu(noisy_nn_out_list_batch)
            optimizer.zero_grads()
            rec_loss, kl_loss, output = model.vae_forward(
                noisy_nn_out_list_batch, real_batch, vae_n_layers_recog,
                vae_n_layers_gen, nonlinear_q=activation,
                nonlinear_p=activation, train=True)
            loss = rec_loss + kl_loss
            total_loss += float(loss.data) * len(noisy_nn_out_list_batch)
            loss.backward()
            optimizer.clip_grads(grad_clip)
            optimizer.update()
        total_vae_losses.append(total_loss / train_x.shape[0])

    # joint NN-VAE fine-tuning and evaluation
    total_nn_vae_losses = []
    total_test_losses = []
    total_train_losses = []
    if cross_val >= 0:
        print('{}s tuning start ...'.format(cross_val))
    for epoch in xrange(1, n_epochs_tuning + 1):
        noisy_train_x = np.array(noisy(noise_nm, train_x), dtype=np.float32)
        # np.random.seed(123)
        indexes = np.random.permutation(train_x.shape[0])
        total_loss = 0.0
        for i in xrange(0, train_x.shape[0], n_batch):
            noisy_x_batch = noisy_train_x[indexes[i:i + n_batch]]
            y_batch = train_y[indexes[i:i + n_batch]]
            if gpu >= 0:
                noisy_x_batch = cuda.to_gpu(noisy_x_batch)
                y_batch = cuda.to_gpu(y_batch)
            optimizer.zero_grads()
            loss, predict_score = model.nn_vae_tuning(
                noisy_x_batch, y_batch, nn_n_layers, vae_n_layers_recog,
                nonlinear_q=activation, train=True)
            loss = loss ** 0.5
            total_loss += float(loss.data) * len(noisy_x_batch)
            loss.backward()
            optimizer.clip_grads(grad_clip)
            optimizer.update()
        total_nn_vae_losses.append(total_loss / train_x.shape[0])

        # evaluate on the training data (RMSE)
        sum_loss_train = 0
        for i in xrange(0, train_x.shape[0], n_batch):
            x_batch = train_x[indexes[i:i + n_batch]]
            y_batch = train_y[indexes[i:i + n_batch]]
            if gpu >= 0:
                x_batch = cuda.to_gpu(x_batch)
                y_batch = cuda.to_gpu(y_batch)
            loss, predict_score = model.nn_vae_tuning(
                x_batch, y_batch, nn_n_layers, vae_n_layers_recog,
                nonlinear_q=activation, train=False)
            loss = loss ** 0.5
            sum_loss_train += float(loss.data) * len(x_batch)
        total_train_losses.append(sum_loss_train / train_x.shape[0])

        # evaluate on the test data
        x_batch = test_x
        y_batch = test_y
        loss, predict_score = model.nn_vae_tuning(
            x_batch, y_batch, nn_n_layers, vae_n_layers_recog,
            nonlinear_q=activation, train=False)
        loss = loss ** 0.5
        total_test_losses.append(loss.data)

    q.put([total_nn_losses, total_vae_losses, total_nn_vae_losses,
           total_train_losses, total_test_losses])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--dataset', '-d', default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--model', '-m', type=str, default=None)
    parser.add_argument('--opt', type=str, default=None)
    parser.add_argument('--epoch', '-e', type=int, default=40)
    parser.add_argument('--looptimes', '-t', type=int, default=5)
    parser.add_argument('--lr', '-l', type=float, default=0.01)
    parser.add_argument('--batch', '-b', type=int, default=128)
    parser.add_argument('--noplot', dest='plot', action='store_false',
                        help='Disable PlotReport extension')
    args = parser.parse_args()

    if args.dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
    elif args.dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    # Set up a neural network to train.
    model = L.Classifier(
        network.LocalPCN(class_labels=class_labels, LoopTimes=args.looptimes))
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    optimizer = optimizers.NesterovAG(lr=args.lr, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(1e-3))

    num_train_samples = 45000
    train_iter = iterators.SerialIterator(train[:num_train_samples],
                                          batch_size=args.batch, shuffle=True)
    test_iter = iterators.SerialIterator(train[num_train_samples:],
                                         batch_size=args.batch,
                                         repeat=False, shuffle=False)

    if args.model is not None:
        print("loading model from " + args.model)
        serializers.load_npz(args.model, model)
    if args.opt is not None:
        print("loading opt from " + args.opt)
        serializers.load_npz(args.opt, optimizer)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out='results')
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))
    trainer.extend(extensions.LogReport(trigger=(10, 'iteration')))
    trainer.extend(extensions.observe_lr(), trigger=(10, 'iteration'))

    # Schedule of a learning rate (LinearShift)
    trainer.extend(
        extensions.LinearShift('lr', (args.lr, args.lr * 0.1),
                               (args.epoch * 0.5, args.epoch * 0.5 + 1)),
        trigger=(1, 'epoch'))

    # Save two plot images to the result dir
    if args.plot and extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'],
                                  'epoch', file_name='accuracy.png'))

    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'lr', 'elapsed_time']),
        trigger=(1, 'iteration'))
    trainer.extend(extensions.ProgressBar(update_interval=1))

    # Plot computation graph
    trainer.extend(extensions.dump_graph('main/loss'))

    # Train
    trainer.run()

    # Save results
    modelname = "./results/model"
    print("saving model to " + modelname)
    serializers.save_npz(modelname, model)

    optimizername = "./results/optimizer"
    print("saving optimizer to " + optimizername)
    serializers.save_npz(optimizername, optimizer)
def SetupOptimizer(model):
    opt = optimizers.NesterovAG(lr=args.optimizer['lr'],
                                momentum=args.optimizer['momentum'])
    opt.setup(model)
    return opt
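# Hedged usage sketch; assumes args.optimizer is a dict such as
# {'lr': 0.1, 'momentum': 0.9}, as the attribute access above implies,
# and that `model` is an existing chainer.Chain:
opt = SetupOptimizer(model)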
        with self.init_scope():
            self.l1 = L.Linear(None, n_mid_units)
            self.l2 = L.Linear(None, n_mid_units)
            self.l3 = L.Linear(None, n_out)

    def forward(self, x):
        h1 = F.relu(self.l1(x))
        h2 = F.relu(self.l2(h1))
        return self.l3(h2)


# Instantiate the model
model = MyNeuralNetwork(n_mid_units=mid_size, n_out=out_size)

# Set up the optimizer for the model
optimizer = optimizers.NesterovAG()
optimizer.setup(model)

# Assign GPU or CPU to the model
if gpu_id >= 0:
    cuda.get_device(gpu_id).use()
    model.to_gpu(gpu_id)
    cp = cuda.cupy

# Problem 2
# Train the model defined in Problem 1 using a triplet loss, so that the
# distance between two normal samples, and between two defective samples,
# is small, while the distance between a normal and a defective sample
# is large.

# An index for searching the anchor image of a triplet
triplet_pos = 0
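# A hedged sketch of one triplet update with the model and optimizer above,
# using chainer.functions.triplet; x_anchor / x_positive / x_negative are
# assumed minibatches (anchor and positive normal, negative defective):
import chainer.functions as F

anchor = model(x_anchor)      # embedding of a normal sample
positive = model(x_positive)  # embedding of another normal sample
negative = model(x_negative)  # embedding of a defective sample
loss = F.triplet(anchor, positive, negative, margin=0.2)
model.cleargrads()
loss.backward()
optimizer.update()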
    conv5=F.Convolution2D(32, 64, (1, 4)),
    conv6=F.Convolution2D(64, 64, (1, 4)),
    conv7=F.Convolution2D(64, 128, (1, 2)),
    conv8=F.Convolution2D(128, 128, (1, 2)),
    conv9=F.Convolution2D(128, 256, (1, 2)),
    conv10=F.Convolution2D(256, 256, (1, 2)),
    fc11=F.Linear(256 * 10 * 1, 1024),
    norm1=F.BatchNormalization(1024),
    fc12=F.Linear(1024, 1024),
    norm2=F.BatchNormalization(1024),
    fc13=F.Linear(1024, 3))

# optimizer = optimizers.MomentumSGD(lr=LR, momentum=0.9)
# optimizer = optimizers.SMORMS3(lr=LR, eps=1e-16)
# optimizer = optimizers.AdaGrad(lr=LR)
optimizer = optimizers.NesterovAG(lr=LR, momentum=0.9)
optimizer.setup(model)

if GPU_ID >= 0:
    cuda.get_device(GPU_ID).use()
    model.to_gpu(GPU_ID)

print 'show train_index,test_index'
print train_index
print test_index
print 'Fold %d ' % (c_f)

N_test = test_index.shape[0]
max_accuracy = 0
def command_line(arguments=None):
    import argparse
    parser = argparse.ArgumentParser(
        description="Train a RNNSearch model",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("data_prefix",
                        help="prefix of the training data created by make_data.py")
    parser.add_argument("save_prefix",
                        help="prefix to be added to all files created during the training")
    parser.add_argument("--gpu", type=int, nargs="+", default=None,
                        help="specify gpu number to use, if any")
    # parser.add_argument("--gpulist", type=int, nargs="+", default=None,
    #                     help="specify gpu number to use, if any")
    parser.add_argument("--load_model",
                        help="load the parameters of a previously trained model")
    parser.add_argument("--load_optimizer_state",
                        help="load previously saved optimizer states")
    parser.add_argument("--Ei", type=int, default=620, help="Source words embedding size.")
    parser.add_argument("--Eo", type=int, default=620, help="Target words embedding size.")
    parser.add_argument("--Hi", type=int, default=1000, help="Source encoding layer size.")
    parser.add_argument("--Ho", type=int, default=1000, help="Target hidden layer size.")
    parser.add_argument("--Ha", type=int, default=1000, help="Attention Module Hidden layer size.")
    parser.add_argument("--Hl", type=int, default=500, help="Maxout output size.")
    parser.add_argument("--mb_size", type=int, default=80, help="Minibatch size")
    parser.add_argument("--nb_batch_to_sort", type=int, default=20,
                        help="Sort this many batches by size.")
    parser.add_argument("--noise_on_prev_word", default=False, action="store_true")
    parser.add_argument("--use_memory_optimization", default=False, action="store_true",
                        help="Experimental option that could strongly reduce memory used.")
    parser.add_argument("--max_nb_iters", type=int, default=None,
                        help="maximum number of iterations")
    parser.add_argument("--max_src_tgt_length", type=int,
                        help="Limit length of training sentences")
    parser.add_argument("--l2_gradient_clipping", type=float, default=1,
                        help="L2 gradient clipping. 0 for None")
    parser.add_argument("--hard_gradient_clipping", type=float, nargs=2,
                        help="hard gradient clipping.")
    parser.add_argument("--weight_decay", type=float, help="Weight decay value.")
    parser.add_argument("--optimizer",
                        choices=["sgd", "rmsprop", "rmspropgraves", "momentum",
                                 "nesterov", "adam", "adagrad", "adadelta"],
                        default="adam", help="Optimizer type.")
    parser.add_argument("--learning_rate", type=float, default=0.01, help="Learning Rate")
    parser.add_argument("--momentum", type=float, default=0.9, help="Momentum term")
    parser.add_argument("--report_every", type=int, default=200,
                        help="report every x iterations")
    parser.add_argument("--randomized_data", default=False, action="store_true")
    parser.add_argument("--use_accumulated_attn", default=False, action="store_true")
    parser.add_argument("--use_deep_attn", default=False, action="store_true")
    parser.add_argument("--no_shuffle_of_training_data", default=False, action="store_true")
    parser.add_argument("--no_resume", default=False, action="store_true")
    parser.add_argument("--init_orth", default=False, action="store_true")
    parser.add_argument("--reverse_src", default=False, action="store_true")
    parser.add_argument("--reverse_tgt", default=False, action="store_true")
    parser.add_argument("--curiculum_training", default=False, action="store_true")
    parser.add_argument("--use_bn_length", default=0, type=int)
    parser.add_argument("--use_previous_prediction", default=0, type=float)
    parser.add_argument("--no_report_or_save", default=False, action="store_true")
    parser.add_argument("--lexical_probability_dictionary",
                        help="lexical translation probabilities in zipped JSON format. "
                             "Used to implement https://arxiv.org/abs/1606.02006")
    parser.add_argument("--lexicon_prob_epsilon", default=1e-3, type=float,
                        help="epsilon value for combining the lexical probabilities")
    parser.add_argument("--encoder_cell_type", default="lstm",
                        help="cell type of encoder. format: type,param1:val1,param2:val2,... "
                             "where type is in [%s]" % (" ".join(rnn_cells.cell_dict.keys())))
    parser.add_argument("--decoder_cell_type", default="lstm",
                        help="cell type of decoder. format same as for encoder")
    parser.add_argument("--sample_every", default=200, type=int)
    parser.add_argument("--save_ckpt_every", default=4000, type=int)
    parser.add_argument("--use_reinf", default=False, action="store_true")
    parser.add_argument("--is_multitarget", default=False, action="store_true")
    parser.add_argument("--postprocess", default=False, action="store_true",
                        help="This flag indicates whether the translations should be "
                             "postprocessed or not. For now it simply indicates that "
                             "the BPE segmentation should be undone.")
    args = parser.parse_args(args=arguments)

    output_files_dict = {}
    output_files_dict["train_config"] = args.save_prefix + ".train.config"
    output_files_dict["model_ckpt"] = args.save_prefix + ".model." + "ckpt" + ".npz"
    output_files_dict["model_final"] = args.save_prefix + ".model." + "final" + ".npz"
    output_files_dict["model_best"] = args.save_prefix + ".model." + "best" + ".npz"
    output_files_dict["model_best_loss"] = args.save_prefix + ".model." + "best_loss" + ".npz"
    output_files_dict["test_translation_output"] = args.save_prefix + ".test.out"
    output_files_dict["test_src_output"] = args.save_prefix + ".test.src.out"
    output_files_dict["dev_translation_output"] = args.save_prefix + ".dev.out"
    output_files_dict["dev_src_output"] = args.save_prefix + ".dev.src.out"
    output_files_dict["valid_translation_output"] = args.save_prefix + ".valid.out"
    output_files_dict["valid_src_output"] = args.save_prefix + ".valid.src.out"
    output_files_dict["sqlite_db"] = args.save_prefix + ".result.sqlite"
    output_files_dict["optimizer_ckpt"] = args.save_prefix + ".optimizer." + "ckpt" + ".npz"
    output_files_dict["optimizer_final"] = args.save_prefix + ".optimizer." + "final" + ".npz"

    save_prefix_dir, save_prefix_fn = os.path.split(args.save_prefix)
    ensure_path(save_prefix_dir)

    already_existing_files = []
    for key_info, filename in output_files_dict.iteritems():  # , valid_data_fn]:
        if os.path.exists(filename):
            already_existing_files.append(filename)
    if len(already_existing_files) > 0:
        print "Warning: existing files are going to be replaced / updated: ", already_existing_files
        # raw_input("Press Enter to Continue")

    config_fn = args.data_prefix + ".data.config"
    voc_fn = args.data_prefix + ".voc"
    data_fn = args.data_prefix + ".data.json.gz"

    log.info("loading training data from %s" % data_fn)
    training_data_all = json.load(gzip.open(data_fn, "rb"))

    training_data = training_data_all["train"]
    log.info("loaded %i sentences as training data" % len(training_data))

    if "test" in training_data_all:
        test_data = training_data_all["test"]
        log.info("Found test data: %i sentences" % len(test_data))
    else:
        test_data = None
        log.info("No test data found")

    if "dev" in training_data_all:
        dev_data = training_data_all["dev"]
        log.info("Found dev data: %i sentences" % len(dev_data))
    else:
        dev_data = None
        log.info("No dev data found")

    if "valid" in training_data_all:
        valid_data = training_data_all["valid"]
        log.info("Found valid data: %i sentences" % len(valid_data))
    else:
        valid_data = None
        log.info("No valid data found")

    log.info("loading voc from %s" % voc_fn)
    src_voc, tgt_voc = json.load(open(voc_fn))
    src_indexer = Indexer.make_from_serializable(src_voc)
    tgt_indexer = Indexer.make_from_serializable(tgt_voc)
    tgt_voc = None
    src_voc = None

    # Vi = len(src_voc) + 1  # + UNK
    # Vo = len(tgt_voc) + 1  # + UNK
    Vi = len(src_indexer)  # + UNK
    Vo = len(tgt_indexer)  # + UNK

    if args.lexical_probability_dictionary is not None:
        log.info("opening lexical_probability_dictionary %s" %
                 args.lexical_probability_dictionary)
        lexical_probability_dictionary_all = json.load(
            gzip.open(args.lexical_probability_dictionary, "rb"))
        log.info("computing lexical_probability_dictionary_indexed")
        lexical_probability_dictionary_indexed = {}
        for ws in lexical_probability_dictionary_all:
            ws_idx = src_indexer.convert([ws])[0]
            if ws_idx in lexical_probability_dictionary_indexed:
                assert src_indexer.is_unk_idx(ws_idx)
            else:
                lexical_probability_dictionary_indexed[ws_idx] = {}
            for wt in lexical_probability_dictionary_all[ws]:
                wt_idx = tgt_indexer.convert([wt])[0]
                if wt_idx in lexical_probability_dictionary_indexed[ws_idx]:
                    assert src_indexer.is_unk_idx(ws_idx) or tgt_indexer.is_unk_idx(wt_idx)
                    lexical_probability_dictionary_indexed[ws_idx][wt_idx] += \
                        lexical_probability_dictionary_all[ws][wt]
                else:
                    lexical_probability_dictionary_indexed[ws_idx][wt_idx] = \
                        lexical_probability_dictionary_all[ws][wt]
        lexical_probability_dictionary = lexical_probability_dictionary_indexed
    else:
        lexical_probability_dictionary = None

    if args.max_src_tgt_length is not None:
        log.info("filtering sentences of length larger than %i" % (args.max_src_tgt_length))
        filtered_training_data = []
        nb_filtered = 0
        for src, tgt in training_data:
            if len(src) <= args.max_src_tgt_length and len(tgt) <= args.max_src_tgt_length:
                filtered_training_data.append((src, tgt))
            else:
                nb_filtered += 1
        log.info("filtered %i sentences of length larger than %i" %
                 (nb_filtered, args.max_src_tgt_length))
        training_data = filtered_training_data

    if not args.no_shuffle_of_training_data:
        log.info("shuffling")
        import random
        random.shuffle(training_data)
        log.info("done")

    is_multitarget = args.is_multitarget
    config_training = {"command_line": args.__dict__, "Vi": Vi, "Vo": Vo,
                       "voc": voc_fn, "data": data_fn,
                       "is_multitarget": is_multitarget}
    save_train_config_fn = output_files_dict["train_config"]
    log.info("Saving training config to %s" % save_train_config_fn)
    with io.open(save_train_config_fn, 'w', encoding="utf-8") as outfile:
        outfile.write(unicode(json.dumps(config_training, ensure_ascii=False)))
    # json.dump(config_training, open(save_train_config_fn, "w"), indent=2, separators=(',', ': '))

    eos_idx = Vo

    # Selecting Attention type
    attn_cls = models.AttentionModule
    if args.use_accumulated_attn:
        raise NotImplementedError()
        # encdec = models.EncoderDecoder(Vi, args.Ei, args.Hi, Vo + 1, args.Eo, args.Ho, args.Ha, args.Hl,
        #                                attn_cls=models.AttentionModuleAcumulated,
        #                                init_orth=args.init_orth)
    if args.use_deep_attn:
        attn_cls = models.DeepAttentionModule

    # Creating encoder/decoder
    encdec = models.EncoderDecoder(
        Vi, args.Ei, args.Hi, Vo + 1, args.Eo, args.Ho, args.Ha, args.Hl,
        init_orth=args.init_orth,
        use_bn_length=args.use_bn_length,
        attn_cls=attn_cls,
        encoder_cell_type=args.encoder_cell_type,
        decoder_cell_type=args.decoder_cell_type,
        lexical_probability_dictionary=lexical_probability_dictionary,
        lex_epsilon=args.lexicon_prob_epsilon,
        is_multitarget=is_multitarget)

    if args.load_model is not None:
        serializers.load_npz(args.load_model, encdec)

    if args.gpu is not None:
        models_list = []
        models_list.append(encdec)
        import copy
        for i in range(len(args.gpu) - 1):
            log.info("Creating copy #%d of model for data parallel computation." % (i + 1))
            encdec_copy = copy.deepcopy(encdec)
            models_list.append(encdec_copy)
        for i in range(len(args.gpu)):
            models_list[i] = models_list[i].to_gpu(args.gpu[i])
        assert models_list[0] == encdec
        # print len(models_list)

    if args.optimizer == "adadelta":
        optimizer = optimizers.AdaDelta()
    elif args.optimizer == "adam":
        optimizer = optimizers.Adam()
    elif args.optimizer == "adagrad":
        optimizer = optimizers.AdaGrad(lr=args.learning_rate)
    elif args.optimizer == "sgd":
        optimizer = optimizers.SGD(lr=args.learning_rate)
    elif args.optimizer == "momentum":
        optimizer = optimizers.MomentumSGD(lr=args.learning_rate, momentum=args.momentum)
    elif args.optimizer == "nesterov":
        optimizer = optimizers.NesterovAG(lr=args.learning_rate, momentum=args.momentum)
    elif args.optimizer == "rmsprop":
        optimizer = optimizers.RMSprop(lr=args.learning_rate)
    elif args.optimizer == "rmspropgraves":
        optimizer = optimizers.RMSpropGraves(lr=args.learning_rate, momentum=args.momentum)
    else:
        raise NotImplementedError()

    with cuda.get_device(args.gpu):
        optimizer.setup(encdec)

    if args.l2_gradient_clipping is not None and args.l2_gradient_clipping > 0:
        optimizer.add_hook(chainer.optimizer.GradientClipping(args.l2_gradient_clipping))
    if args.hard_gradient_clipping is not None and args.hard_gradient_clipping > 0:
        optimizer.add_hook(chainer.optimizer.GradientHardClipping(
            *args.hard_gradient_clipping))
    if args.weight_decay is not None:
        optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))

    if args.load_optimizer_state is not None:
        with cuda.get_device(args.gpu):
            serializers.load_npz(args.load_optimizer_state, optimizer)

    with cuda.get_device(args.gpu[0]):
        # with MyTimerHook() as timer:
        # try:
        train_on_data(encdec, optimizer, training_data, output_files_dict,
                      src_indexer, tgt_indexer, eos_idx=eos_idx,
                      mb_size=args.mb_size,
                      nb_of_batch_to_sort=args.nb_batch_to_sort * len(args.gpu),
                      test_data=test_data, dev_data=dev_data, valid_data=valid_data,
                      gpu=args.gpu, report_every=args.report_every,
                      randomized=args.randomized_data,
                      reverse_src=args.reverse_src, reverse_tgt=args.reverse_tgt,
                      max_nb_iters=args.max_nb_iters,
                      do_not_save_data_for_resuming=args.no_resume,
                      noise_on_prev_word=args.noise_on_prev_word,
                      curiculum_training=args.curiculum_training,
                      use_previous_prediction=args.use_previous_prediction,
                      no_report_or_save=args.no_report_or_save,
                      use_memory_optimization=args.use_memory_optimization,
                      sample_every=args.sample_every,
                      use_reinf=args.use_reinf,
                      save_ckpt_every=args.save_ckpt_every,
                      postprocess=args.postprocess,
                      models_list=models_list
                      # lexical_probability_dictionary=lexical_probability_dictionary,
                      # V_tgt=Vo + 1,
                      # lexicon_prob_epsilon=args.lexicon_prob_epsilon
                      )
        # finally:
        #     print timer
        #     timer.print_sorted()
        #     print "total time:"
        #     print(timer.total_time())

    import sys
    sys.exit(0)

    # NOTE: everything below is unreachable after sys.exit(0) above
    import training_chainer
    with cuda.get_device(args.gpu):
        training_chainer.train_on_data_chainer(
            encdec, optimizer, training_data, output_files_dict,
            src_indexer, tgt_indexer, eos_idx=eos_idx,
            output_dir=args.save_prefix,
            stop_trigger=None,
            mb_size=args.mb_size,
            nb_of_batch_to_sort=args.nb_batch_to_sort,
            test_data=test_data, dev_data=dev_data, valid_data=valid_data,
            gpu=args.gpu, report_every=args.report_every,
            randomized=args.randomized_data,
            reverse_src=args.reverse_src, reverse_tgt=args.reverse_tgt,
            max_nb_iters=args.max_nb_iters,
            do_not_save_data_for_resuming=args.no_resume,
            noise_on_prev_word=args.noise_on_prev_word,
            curiculum_training=args.curiculum_training,
            use_previous_prediction=args.use_previous_prediction,
            no_report_or_save=args.no_report_or_save,
            use_memory_optimization=args.use_memory_optimization,
            sample_every=args.sample_every,
            use_reinf=args.use_reinf,
            save_ckpt_every=args.save_ckpt_every,
            postprocess=args.postprocess
            # lexical_probability_dictionary=lexical_probability_dictionary,
            # V_tgt=Vo + 1,
            # lexicon_prob_epsilon=args.lexicon_prob_epsilon
            )
def main():
    parser = ArgumentParser()
    parser.add_argument('train_data', help='train data')
    parser.add_argument('train_labels', help='train labels')
    parser.add_argument('--val-data', default=None, help='val data')
    parser.add_argument('--val-labels', default=None, help='val labels')
    parser.add_argument('-b', '--batch-size', type=int, default=5,
                        help='mini-batch size (default=5)')
    parser.add_argument('--beta2', type=float, default=0.999,
                        help='beta2 of Adam (default=0.999)')
    parser.add_argument('-g', '--gpu-id', type=int, default=-1,
                        help='GPU ID (default=-1, indicates CPU)')
    parser.add_argument('--ignore-labels', type=int, default=[], nargs='+',
                        help='labels to ignore (default=[])')
    parser.add_argument('-l', '--learning-rate', type=float, default=0.1,
                        help='learning rate (default=0.1)')
    parser.add_argument('--max-iter', type=int, default=160000,
                        help='train model up to max-iter (default=160000)')
    parser.add_argument('--mean-interval', type=int, default=1000,
                        help='calculate mean of train/loss (and validation loss) '
                             'every mean-interval iters (default=1000)')
    parser.add_argument('--model', default=None, help='resume to train the model')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='momentum rate (default=0.9)')
    parser.add_argument('--n-classes', type=int, default=5,
                        help='number of classes (default=5)')
    parser.add_argument('--noise', default='no',
                        help='noise injection method. \'no\', \'patch\', '
                             'and \'permutation\' are available (default=\'no\')')
    parser.add_argument('--optim', default='nesterov',
                        help='optimization method. \'sgd\', \'nesterov\', '
                             'and \'adam\' are available (default=\'nesterov\')')
    parser.add_argument('-o', '--outdir', default='./',
                        help='trained models and optimizer states are stored in outdir '
                             '(default=\'./\')')
    parser.add_argument('--queue-maxsize', type=int, default=10,
                        help='maxsize of queues for training and validation (default=10)')
    parser.add_argument('--save-interval', type=int, default=10000,
                        help='save model & optimizer every save-interval iters (default=10000)')
    parser.add_argument('--state', default=None,
                        help='optimizer state. resume to train the model with the optimizer')
    parser.add_argument('-w', '--weight-decay', type=float, default=1e-4,
                        help='weight decay factor (default=1e-4)')
    args = parser.parse_args()

    print(argv2string(sys.argv) + '\n')
    for arg in dir(args):
        if arg[:1] == '_':
            continue
        print('{} = {}'.format(arg, getattr(args, arg)))
    print()

    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)
        print('mkdir ' + args.outdir + '\n')

    model = Model(in_ch=3, out_ch=args.n_classes)
    if args.model is not None:
        S.load_npz(args.model, model)
    loss_func = Loss(model)

    # note: the substring checks below also accept abbreviations such as 'nag'
    if args.optim.lower() in 'sgd':
        if args.momentum > 0:
            optim = optims.CorrectedMomentumSGD(lr=args.learning_rate,
                                                momentum=args.momentum)
        else:
            optim = optims.SGD(lr=args.learning_rate)
    elif args.optim.lower() in 'nesterovag':
        optim = optims.NesterovAG(lr=args.learning_rate, momentum=args.momentum)
    elif args.optim.lower() in 'adam':
        optim = optims.Adam(alpha=args.learning_rate, beta1=args.momentum,
                            beta2=args.beta2,
                            weight_decay_rate=args.weight_decay, amsgrad=True)
    else:
        raise ValueError('Please specify an available optimizer name.\n'
                         'SGD, NesterovAG, and Adam are available.')
    print('{}\n'.format(type(optim)))

    optim.setup(model)
    if args.state is not None:
        S.load_npz(args.state, optim)
    if (args.weight_decay > 0) and not isinstance(optim, optims.Adam):
        optim.add_hook(WeightDecay(args.weight_decay))
    optim.add_hook(GradientClipping(1))

    lr_decay_iter_dict = {
        int(5 * args.max_iter / 8): 0.1,
        int(7 * args.max_iter / 8): 0.1,
    }

    with open(args.train_data, 'r') as f:
        train_data_path_list = [line.strip() for line in f.readlines()]
    with open(args.train_labels, 'r') as f:
        train_labels_path_list = [line.strip() for line in f.readlines()]
    assert len(train_data_path_list) == len(train_labels_path_list)

    if (args.val_data is not None) or (args.val_labels is not None):
        if (args.val_data is not None) and (args.val_labels is not None):
            with open(args.val_data, 'r') as f:
                val_data_path_list = [line.strip() for line in f.readlines()]
            with open(args.val_labels, 'r') as f:
                val_labels_path_list = [line.strip() for line in f.readlines()]
            assert len(val_data_path_list) == len(val_labels_path_list)
        else:
            raise ValueError('Either val_data or val_labels is not specified.')

    train_queue = mp.Queue(maxsize=args.queue_maxsize)
    train_generator = BatchGenerator(args.batch_size, train_data_path_list,
                                     train_labels_path_list, train_queue,
                                     train=True, noise_injection=args.noise,
                                     out_height=512, out_width=512,
                                     max_height=1216, max_width=1216,
                                     min_height=832, min_width=832)
    train_generator.start()

    if args.val_data is None:
        val_queue = None
    else:
        val_queue = mp.Queue(maxsize=args.queue_maxsize)
        try:
            val_generator = BatchGenerator(1, val_data_path_list,
                                           val_labels_path_list, val_queue,
                                           train=False,
                                           out_height=608, out_width=968)
            val_generator.start()
        except Exception:
            train_generator.terminate()
            train_queue.close()
            val_queue.close()
            raise

    try:
        train(loss_func, optim, train_queue, args.max_iter, args.mean_interval,
              args.save_interval, val_queue, lr_decay_iter_dict, args.gpu_id,
              args.ignore_labels, args.outdir)
    except BaseException:
        train_generator.terminate()
        train_queue.close()
        if val_queue is not None:
            val_generator.terminate()
            val_queue.close()
        raise

    train_generator.terminate()
    train_queue.close()
    if val_queue is not None:
        val_generator.terminate()
        val_queue.close()
n_symbol = 5
n_epoch = 2  # number of epochs
SIL_idx = 0  # index of blank symbol
grad_clip = 10  # gradient norm threshold to clip; larger values let the model converge faster

model = RNNASR(n_feature, n_units, n_symbol)

## use GPU or not
useGPU = False
xp = cuda.cupy if useGPU else np
if useGPU:
    cuda.get_device(0).use()
    model.to_gpu()

## Setup optimizer
optimizer = optimizers.NesterovAG()  # somewhat faster than RMSpropGraves
# optimizer = optimizers.RMSpropGraves()
optimizer.setup(model)
optimizer.add_hook(chainer.optimizer.GradientClipping(grad_clip))  # essential

# train loop
counter = 0
trainsize = len(trainset)
for epoch in range(1, n_epoch + 1):
    indexes = np.random.permutation(trainsize)
    for i in range(trainsize):
        x_data = os.path.join(data_root, trainset[indexes[i]][0])
        y_data = trainset[indexes[i]][1]
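        # The loop body is cut off in the snippet above; a hedged sketch of
        # the update step that typically follows in Chainer (the feature
        # loading and the loss interface of RNNASR are assumptions):
        x = xp.asarray(np.load(x_data), dtype=np.float32)  # assumed .npy features
        loss = model(x, y_data)  # assumed to return the training loss
        model.cleargrads()
        loss.backward()
        optimizer.update()
        counter += 1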