def train(args):
    """Train an MT model with the given args.

    Sets up the model, optimizer, data iterators and a chainer-style
    trainer with reporting/snapshot extensions, then runs training.

    :param Namespace args: The program arguments
    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')

    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    # NOTE(review): for MT json the source side appears to live in
    # output[1] and the target side in output[0] — confirm against the
    # data-prep recipe.
    idim = int(valid_json[utts[0]]['output'][1]['shape'][1])
    odim = int(valid_json[utts[0]]['output'][0]['shape'][1])
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # specify model architecture
    model_class = dynamic_import(args.model_module)
    model = model_class(idim, odim, args)
    assert isinstance(model, MTInterface)

    if args.rnnlm is not None:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(args.char_list), rnnlm_args.layer, rnnlm_args.unit))
        # FIX: was `torch.load(args.rnnlm, rnnlm)`, which passes the model
        # as torch.load's `map_location` argument and never restores the
        # weights. torch_load (already used below for restore_snapshot)
        # loads the snapshot into the model in place.
        torch_load(args.rnnlm, rnnlm)
        model.rnnlm = rnnlm

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to ' + model_conf)
        f.write(json.dumps((idim, odim, vars(args)),
                           indent=4, ensure_ascii=False,
                           sort_keys=True).encode('utf_8'))
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu)))
        if args.batch_size != 0:
            logging.info('batch size is automatically increased (%d -> %d)' % (
                args.batch_size, args.batch_size * args.ngpu))
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # Setup an optimizer
    if args.opt == 'adadelta':
        optimizer = torch.optim.Adadelta(
            model.parameters(), rho=0.95, eps=args.eps,
            weight_decay=args.weight_decay)
    elif args.opt == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     weight_decay=args.weight_decay)
    elif args.opt == 'noam':
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt
        optimizer = get_std_opt(model, args.adim,
                                args.transformer_warmup_steps,
                                args.transformer_lr)
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # FIXME: TOO DIRTY HACK
    # the chainer trainer expects an optimizer with target/serialize
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    converter = CustomConverter(idim=idim)

    # read json data
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    # make minibatch list (variable length)
    train = make_batchset(train_json, args.batch_size,
                          args.maxlen_in, args.maxlen_out, args.minibatches,
                          min_batch_size=args.ngpu if args.ngpu > 1 else 1,
                          shortest_first=use_sortagrad,
                          count=args.batch_count,
                          batch_bins=args.batch_bins,
                          batch_frames_in=args.batch_frames_in,
                          batch_frames_out=args.batch_frames_out,
                          batch_frames_inout=args.batch_frames_inout,
                          mt=True, iaxis=1, oaxis=0)
    valid = make_batchset(valid_json, args.batch_size,
                          args.maxlen_in, args.maxlen_out, args.minibatches,
                          min_batch_size=args.ngpu if args.ngpu > 1 else 1,
                          count=args.batch_count,
                          batch_bins=args.batch_bins,
                          batch_frames_in=args.batch_frames_in,
                          batch_frames_out=args.batch_frames_out,
                          batch_frames_inout=args.batch_frames_inout,
                          mt=True, iaxis=1, oaxis=0)

    load_tr = LoadInputsAndTargets(
        mode='mt', load_output=True, preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': True}  # Switch the mode of preprocessing
    )
    load_cv = LoadInputsAndTargets(
        mode='mt', load_output=True, preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': False}  # Switch the mode of preprocessing
    )
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    if args.n_iter_processes > 0:
        train_iter = ToggleableShufflingMultiprocessIterator(
            TransformDataset(train, load_tr),
            batch_size=1, n_processes=args.n_iter_processes,
            n_prefetch=8, maxtasksperchild=20, shuffle=not use_sortagrad)
        valid_iter = ToggleableShufflingMultiprocessIterator(
            TransformDataset(valid, load_cv),
            batch_size=1, repeat=False, shuffle=False,
            n_processes=args.n_iter_processes,
            n_prefetch=8, maxtasksperchild=20)
    else:
        train_iter = ToggleableShufflingSerialIterator(
            TransformDataset(train, load_tr),
            batch_size=1, shuffle=not use_sortagrad)
        valid_iter = ToggleableShufflingSerialIterator(
            TransformDataset(valid, load_cv),
            batch_size=1, repeat=False, shuffle=False)

    # Set up a trainer
    updater = CustomUpdater(
        model, args.grad_clip, train_iter, optimizer, converter, device,
        args.ngpu, args.accum_grad)
    trainer = training.Trainer(
        updater, (args.epochs, 'epoch'), out=args.outdir)

    if use_sortagrad:
        # start shuffling once the sortagrad warm-up epochs are over
        trainer.extend(ShufflingEnabler([train_iter]),
                       trigger=(args.sortagrad if args.sortagrad != -1
                                else args.epochs, 'epoch'))

    # Resume from a snapshot
    if args.resume:
        logging.info('resumed from %s' % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(CustomEvaluator(model, valid_iter, reporter, converter,
                                   device))

    # Save attention weight each epoch
    if args.num_save_attention > 0:
        # sort it by output lengths
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['output'][0]['shape'][0]),
                      reverse=True)
        if hasattr(model, "module"):
            # DataParallel wraps the real model in .module
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn, data, args.outdir + "/att_ws",
            converter=converter, transform=load_cv, device=device,
            ikey="output", iaxis=1)
        trainer.extend(att_reporter, trigger=(1, 'epoch'))
    else:
        att_reporter = None

    # Make a plot for training and validation values
    trainer.extend(extensions.PlotReport(['main/loss',
                                          'validation/main/loss',
                                          'main/loss_att',
                                          'validation/main/loss_att'],
                                         'epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(['main/acc', 'validation/main/acc'],
                                         'epoch', file_name='acc.png'))
    trainer.extend(extensions.PlotReport(['main/ppl', 'validation/main/ppl'],
                                         'epoch', file_name='ppl.png'))

    # Save best models
    trainer.extend(snapshot_object(model, 'model.loss.best'),
                   trigger=training.triggers.MinValueTrigger(
                       'validation/main/loss'))
    trainer.extend(snapshot_object(model, 'model.acc.best'),
                   trigger=training.triggers.MaxValueTrigger(
                       'validation/main/acc'))

    # save snapshot which contains model and optimizer states
    trainer.extend(torch_snapshot(), trigger=(1, 'epoch'))

    # epsilon decay in the optimizer: when the monitored metric degrades,
    # restore the best snapshot and shrink Adadelta's eps
    if args.opt == 'adadelta':
        if args.criterion == 'acc':
            trainer.extend(restore_snapshot(
                model, args.outdir + '/model.acc.best', load_fn=torch_load),
                trigger=CompareValueTrigger(
                    'validation/main/acc',
                    lambda best_value, current_value:
                        best_value > current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/acc',
                               lambda best_value, current_value:
                                   best_value > current_value))
        elif args.criterion == 'loss':
            trainer.extend(restore_snapshot(
                model, args.outdir + '/model.loss.best', load_fn=torch_load),
                trigger=CompareValueTrigger(
                    'validation/main/loss',
                    lambda best_value, current_value:
                        best_value < current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/loss',
                               lambda best_value, current_value:
                                   best_value < current_value))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(
        trigger=(args.report_interval_iters, 'iteration')))
    report_keys = ['epoch', 'iteration', 'main/loss', 'validation/main/loss',
                   'main/acc', 'validation/main/acc', 'main/ppl',
                   'validation/main/ppl', 'elapsed_time']
    if args.opt == 'adadelta':
        trainer.extend(extensions.observe_value(
            'eps', lambda trainer: trainer.updater.get_optimizer('main').
            param_groups[0]["eps"]),
            trigger=(args.report_interval_iters, 'iteration'))
        report_keys.append('eps')
    trainer.extend(extensions.PrintReport(
        report_keys), trigger=(args.report_interval_iters, 'iteration'))

    trainer.extend(extensions.ProgressBar(
        update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(TensorboardLogger(writer, att_reporter),
                       trigger=(args.report_interval_iters, 'iteration'))

    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
# Trainer assembly for a Chainer classification run.
# NOTE(review): relies on `updater`, `valid_iter`, `net`, `gpu_id` and `np`
# being defined earlier in the file (outside this chunk) — verify before
# moving this block.
max_epoch = 10
trainer = training.Trainer(updater, (max_epoch, 'epoch'), out='result')
# log and snapshot once per epoch (snapshot name embeds the epoch number)
trainer.extend(extensions.LogReport())
trainer.extend(extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'))
# evaluate on the validation iterator; metrics reported under the 'val/' prefix
trainer.extend(extensions.Evaluator(valid_iter, net, device=gpu_id),
               name='val')
trainer.extend(
    extensions.PrintReport([
        'epoch', 'main/loss', 'main/accuracy', 'val/main/loss',
        'val/main/accuracy', 'l1/W/data/std', 'elapsed_time'
    ]))
# track the standard deviation of the first layer's weight matrix
trainer.extend(
    extensions.ParameterStatistics(net.predictor.l1, {'std': np.std}))
trainer.extend(
    extensions.PlotReport(['l1/W/data/std'],
                          x_key='epoch', file_name='std.png'))
trainer.extend(
    extensions.PlotReport(['main/loss', 'val/main/loss'],
                          x_key='epoch', file_name='loss.png'))
trainer.extend(
    extensions.PlotReport(['main/accuracy', 'val/main/accuracy'],
                          x_key='epoch', file_name='accuracy.png'))
# dump the computational graph rooted at the training loss
trainer.extend(extensions.dump_graph('main/loss'))
trainer.run()
def train(args):
    """Run training (legacy pytorch backend with Kaldi readers).

    :param Namespace args: The program arguments
    """
    # seed setting
    torch.manual_seed(args.seed)

    # debug mode setting
    # 0 would be fastest, but 1 seems to be reasonable
    # by considering reproducibility
    # disable type check for speed
    if args.debugmode < 2:
        chainer.config.type_check = False
        logging.info('torch type check is disabled')
    # use deterministic computation or not
    if args.debugmode < 1:
        torch.backends.cudnn.deterministic = False
        logging.info('torch cudnn deterministic is disabled')
    else:
        torch.backends.cudnn.deterministic = True

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')

    # get input and output dimension info
    with open(args.valid_label, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]['idim'])
    odim = int(valid_json[utts[0]]['odim'])
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # specify attention, CTC, hybrid mode
    if args.mtlalpha == 1.0:
        mtl_mode = 'ctc'
        logging.info('Pure CTC mode')
    elif args.mtlalpha == 0.0:
        mtl_mode = 'att'
        logging.info('Pure attention mode')
    else:
        mtl_mode = 'mtl'
        logging.info('Multitask learning mode')

    # specify model architecture
    e2e = E2E(idim, odim, args)
    model = Loss(e2e, args.mtlalpha)

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.conf'
    with open(model_conf, 'wb') as f:
        # FIX: message previously lacked the space before the path
        logging.info('writing a model config file to ' + model_conf)
        # TODO(watanabe) use others than pickle, possibly json,
        # and save as a text
        pickle.dump((idim, odim, args), f)
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # Set gpu
    reporter = model.reporter
    ngpu = args.ngpu
    if ngpu == 1:
        gpu_id = range(ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
    elif ngpu > 1:
        gpu_id = range(ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        model = DataParallel(model, device_ids=gpu_id)
        model.cuda()
        logging.info('batch size is automatically increased (%d -> %d)' % (
            args.batch_size, args.batch_size * args.ngpu))
        args.batch_size *= args.ngpu
    else:
        gpu_id = [-1]

    # Setup an optimizer
    if args.opt == 'adadelta':
        optimizer = torch.optim.Adadelta(
            model.parameters(), rho=0.95, eps=args.eps)
    elif args.opt == 'adam':
        optimizer = torch.optim.Adam(model.parameters())

    # FIXME: TOO DIRTY HACK
    # the chainer trainer expects an optimizer with target/serialize
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # read json data
    with open(args.train_label, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_label, 'rb') as f:
        valid_json = json.load(f)['utts']

    # make minibatch list (variable length)
    train = make_batchset(train_json, args.batch_size,
                          args.maxlen_in, args.maxlen_out, args.minibatches)
    valid = make_batchset(valid_json, args.batch_size,
                          args.maxlen_in, args.maxlen_out, args.minibatches)
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    train_iter = chainer.iterators.SerialIterator(train, 1)
    valid_iter = chainer.iterators.SerialIterator(
        valid, 1, repeat=False, shuffle=False)

    # prepare Kaldi reader
    train_reader = lazy_io.read_dict_scp(args.train_feat)
    valid_reader = lazy_io.read_dict_scp(args.valid_feat)

    # Set up a trainer
    updater = PytorchSeqUpdaterKaldi(
        model, args.grad_clip, train_iter, optimizer, train_reader, gpu_id)
    trainer = training.Trainer(
        updater, (args.epochs, 'epoch'), out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)
        if ngpu > 1:
            model.module.load_state_dict(
                torch.load(args.outdir + '/model.acc.best'))
        else:
            model.load_state_dict(
                torch.load(args.outdir + '/model.acc.best'))
        model = trainer.updater.model

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(PytorchSeqEvaluaterKaldi(
        model, valid_iter, reporter, valid_reader, device=gpu_id))

    # Save attention weight each epoch
    if args.num_save_attention > 0 and args.mtlalpha != 1.0:
        # FIX: dict.items() is a view on Python 3 and cannot be sliced;
        # materialize to a list before slicing
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['ilen']), reverse=True)
        data = converter_kaldi(data, valid_reader)
        trainer.extend(PlotAttentionReport(
            model, data, args.outdir + "/att_ws"), trigger=(1, 'epoch'))

    # Take a snapshot for each specified epoch
    trainer.extend(extensions.snapshot(), trigger=(1, 'epoch'))

    # Make a plot for training and validation values
    trainer.extend(extensions.PlotReport([
        'main/loss', 'validation/main/loss',
        'main/loss_ctc', 'validation/main/loss_ctc',
        'main/loss_att', 'validation/main/loss_att'
    ], 'epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(
        ['main/acc', 'validation/main/acc'], 'epoch', file_name='acc.png'))

    # Save best models
    def torch_save(path, _):
        # save both the raw state dict and the pickled module;
        # unwrap DataParallel first
        if ngpu > 1:
            torch.save(model.module.state_dict(), path)
            torch.save(model.module, path + ".pkl")
        else:
            torch.save(model.state_dict(), path)
            torch.save(model, path + ".pkl")

    trainer.extend(
        extensions.snapshot_object(model, 'model.loss.best',
                                   savefun=torch_save),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))
    # FIX: was `mtl_mode is not 'ctc'` — identity comparison with a str
    # literal (a SyntaxWarning on CPython >= 3.8); value comparison intended.
    if mtl_mode != 'ctc':
        trainer.extend(
            extensions.snapshot_object(model, 'model.acc.best',
                                       savefun=torch_save),
            trigger=training.triggers.MaxValueTrigger('validation/main/acc'))

    # epsilon decay in the optimizer
    def torch_load(path, obj):
        # load weights back into the (possibly DataParallel-wrapped) model
        if ngpu > 1:
            model.module.load_state_dict(torch.load(path))
        else:
            model.load_state_dict(torch.load(path))
        return obj

    if args.opt == 'adadelta':
        if args.criterion == 'acc' and mtl_mode != 'ctc':
            trainer.extend(restore_snapshot(
                model, args.outdir + '/model.acc.best', load_fn=torch_load),
                trigger=CompareValueTrigger(
                    'validation/main/acc',
                    lambda best_value, current_value:
                        best_value > current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/acc',
                               lambda best_value, current_value:
                                   best_value > current_value))
        elif args.criterion == 'loss':
            trainer.extend(restore_snapshot(
                model, args.outdir + '/model.loss.best', load_fn=torch_load),
                trigger=CompareValueTrigger(
                    'validation/main/loss',
                    lambda best_value, current_value:
                        best_value < current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/loss',
                               lambda best_value, current_value:
                                   best_value < current_value))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=(100, 'iteration')))
    report_keys = [
        'epoch', 'iteration', 'main/loss', 'main/loss_ctc', 'main/loss_att',
        'validation/main/loss', 'validation/main/loss_ctc',
        'validation/main/loss_att', 'main/acc', 'validation/main/acc',
        'elapsed_time'
    ]
    if args.opt == 'adadelta':
        trainer.extend(extensions.observe_value(
            'eps', lambda trainer: trainer.updater.get_optimizer('main').
            param_groups[0]["eps"]), trigger=(100, 'iteration'))
        report_keys.append('eps')
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(100, 'iteration'))
    trainer.extend(extensions.ProgressBar())

    # Run the training
    trainer.run()
def main(config_path):
    """Train a sequence-labeling (NER) model from a config file.

    Loads train/dev sentences, builds word/char/tag mappings, prepares
    datasets, and runs a Chainer trainer with early stopping on
    ``dev/main/fscore``.

    :param config_path: path to the configuration file parsed by
        ``parse_config``
    """
    # Init args
    args = parse_config(config_path)

    # Load sentences
    train_sentences = load_sentences(args["path_train"],
                                     args["replace_digit"])
    dev_sentences = load_sentences(args["path_dev"], args["replace_digit"])

    # Update tagging scheme (IOB/IOBES)
    update_tag_scheme(train_sentences, args["tag_scheme"])
    update_tag_scheme(dev_sentences, args["tag_scheme"])

    # Create a dictionary / mapping of words.
    # With a pretrained embedding, the train vocabulary is augmented with
    # dev words covered by the embedding.
    if args['path_pre_emb']:
        dico_words_train = word_mapping(train_sentences,
                                        args["lowercase"])[0]
        dico_words, word_to_id, id_to_word, pretrained = \
            augment_with_pretrained(
                dico_words_train.copy(), args['path_pre_emb'],
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in dev_sentences])))
    else:
        dico_words, word_to_id, id_to_word = word_mapping(
            train_sentences, args["lowercase"])
        dico_words_train = dico_words

    # Create a dictionary and a mapping for words / POS tags / tags
    dico_chars, char_to_id, id_to_char = char_mapping(
        train_sentences + dev_sentences)
    dico_entities, entity_to_id, id_to_entity = entity_mapping(
        train_sentences + dev_sentences)

    # Set id of tag 'O' as 0 in order to make it easier for padding
    # Resort id_to_tag
    id_to_tag, tag_to_id = entity_tags(id_to_entity)

    # singletons: train words with frequency 1 (used for dropout-style
    # replacement during training only)
    if args["use_singletons"]:
        singletons = set([word_to_id[k]
                          for k, v in dico_words_train.items() if v == 1])
    else:
        singletons = None

    # Index data
    train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                 tag_to_id, singletons, args["lowercase"])
    dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id,
                               tag_to_id, None, args["lowercase"])

    print("%i / %i sentences in train / dev."
          % (len(train_data), len(dev_data)))

    # Init model
    model = Model(len(word_to_id), len(char_to_id), len(tag_to_id), args)
    if args['gpus']['main'] >= 0:
        cuda.get_device_from_id(args['gpus']['main']).use()
        model.to_gpu()

    print('Saving the mappings to disk...')
    model.save_mappings(id_to_word, id_to_char, id_to_tag, args)

    if args['path_pre_emb']:
        print("Loading pretrained embedding...")
        model.load_pretrained(args['path_pre_emb'])

    result_path = '../result/'

    # Init Iterators
    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  model.batch_size)
    dev_iter = chainer.iterators.SerialIterator(dev_data, model.batch_size,
                                                repeat=False)

    # Reset cost matrix
    # NOTE(review): overwrites the CRF transition cost with a constrained
    # matrix built from the tag set — presumably to forbid invalid tag
    # transitions; confirm against load_cost_matrix.
    id_to_tag = model.id_to_tag
    cost = model.crf.cost.data
    model.crf.cost.data = load_cost_matrix(id_to_tag, cost)

    # Init Optimizer
    optimizer = chainer.optimizers.Adam(model.lr_param)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(model.threshold))
    optimizer.add_hook(chainer.optimizer.WeightDecay(model.decay_rate))

    # Init early_stopping_trigger
    early_stopping_trigger = EarlyStoppingTrigger(
        args["epoch"], key='dev/main/fscore',
        eps=args["early_stopping_eps"],
        early_stopping=args["early_stopping"])

    # Init Updater, Trainer and Evaluator
    updater = Updater(train_iter, optimizer, args['gpus'])
    trainer = training.Trainer(updater,
                               stop_trigger=early_stopping_trigger,
                               out=result_path)
    trainer.extend(Evaluator(dev_iter, optimizer.target, args['gpus']))

    # Save the best model (by dev F-score)
    trainer.extend(
        extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
        trigger=training.triggers.MaxValueTrigger('dev/main/fscore'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'dev/main/loss', 'main/accuracy',
         'dev/main/accuracy', 'elapsed_time']))
    if extensions.PlotReport.available():
        # Plot graph for loss,accuracy and fscore for each epoch
        trainer.extend(extensions.PlotReport(
            ['main/loss', 'dev/main/loss'],
            x_key='epoch', file_name='loss.png'))
        trainer.extend(extensions.PlotReport(
            ['main/accuracy', 'dev/main/accuracy'],
            x_key='epoch', file_name='accuracy.png'))
        trainer.extend(extensions.PlotReport(
            ['dev/main/fscore'],
            x_key='epoch', file_name='fscore.png'))

    trainer.run()
def train(args):
    """Run training (chainer backend).

    :param Namespace args: The program arguments
    """
    # display chainer version
    logging.info('chainer version = ' + chainer.__version__)

    # seed setting (chainer seed may not need it)
    os.environ['CHAINER_SEED'] = str(args.seed)
    logging.info('chainer seed = ' + os.environ['CHAINER_SEED'])

    # debug mode setting
    # 0 would be fastest, but 1 seems to be reasonable
    # by considering reproducibility
    # disable type check for speed
    if args.debugmode < 2:
        chainer.config.type_check = False
        logging.info('chainer type check is disabled')
    # use deterministic computation or not
    if args.debugmode < 1:
        chainer.config.cudnn_deterministic = False
        logging.info('chainer cudnn deterministic is disabled')
    else:
        chainer.config.cudnn_deterministic = True

    # check cuda and cudnn availability
    if not chainer.cuda.available:
        logging.warning('cuda is not available')
    if not chainer.cuda.cudnn_enabled:
        logging.warning('cudnn is not available')

    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]['input'][0]['shape'][1])
    odim = int(valid_json[utts[0]]['output'][0]['shape'][1])
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # check attention type
    if args.atype not in ['noatt', 'dot', 'location']:
        raise NotImplementedError(
            'chainer supports only noatt, dot, and location attention.')

    # specify attention, CTC, hybrid mode
    if args.mtlalpha == 1.0:
        mtl_mode = 'ctc'
        logging.info('Pure CTC mode')
    elif args.mtlalpha == 0.0:
        mtl_mode = 'att'
        logging.info('Pure attention mode')
    else:
        mtl_mode = 'mtl'
        logging.info('Multitask learning mode')

    # specify model architecture
    e2e = E2E(idim, odim, args)
    model = Loss(e2e, args.mtlalpha)

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to ' + model_conf)
        f.write(json.dumps((idim, odim, vars(args)),
                           indent=4, sort_keys=True).encode('utf_8'))
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # Set gpu
    ngpu = args.ngpu
    if ngpu == 1:
        gpu_id = 0
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()  # Copy the model to the GPU
        logging.info('single gpu calculation.')
    elif ngpu > 1:
        gpu_id = 0
        devices = {'main': gpu_id}
        for gid in six.moves.xrange(1, ngpu):
            devices['sub_%d' % gid] = gid
        logging.info('multi gpu calculation (#gpus = %d).' % ngpu)
        logging.info('batch size is automatically increased (%d -> %d)' % (
            args.batch_size, args.batch_size * args.ngpu))
    else:
        gpu_id = -1
        logging.info('cpu calculation')

    # Setup an optimizer
    if args.opt == 'adadelta':
        optimizer = chainer.optimizers.AdaDelta(eps=args.eps)
    elif args.opt == 'adam':
        optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))

    # read json data
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']

    # set up training iterator and updater
    converter = CustomConverter(e2e.subsample[0])
    if ngpu <= 1:
        # make minibatch list (variable length)
        train = make_batchset(train_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              args.minibatches)
        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        train_iter = chainer.iterators.MultiprocessIterator(
            TransformDataset(train, converter.transform),
            1, n_processes=1, n_prefetch=8, maxtasksperchild=20)
        # set up updater
        updater = CustomUpdater(
            train_iter, optimizer, converter=converter, device=gpu_id)
    else:
        # set up minibatches: shard the utterances round-robin across gpus
        train_subsets = []
        for gid in six.moves.xrange(ngpu):
            # make subset
            train_json_subset = {
                k: v for i, (k, v) in enumerate(train_json.items())
                if i % ngpu == gid
            }
            # make minibatch list (variable length)
            train_subsets += [
                make_batchset(train_json_subset, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              args.minibatches)
            ]

        # each subset must have same length for MultiprocessParallelUpdater
        maxlen = max([len(train_subset) for train_subset in train_subsets])
        for train_subset in train_subsets:
            if maxlen != len(train_subset):
                for i in six.moves.xrange(maxlen - len(train_subset)):
                    train_subset += [train_subset[i]]

        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        train_iters = [
            chainer.iterators.MultiprocessIterator(
                TransformDataset(train_subsets[gid], converter.transform),
                1, n_processes=1, n_prefetch=8, maxtasksperchild=20)
            for gid in six.moves.xrange(ngpu)
        ]

        # set up updater
        updater = CustomParallelUpdater(
            train_iters, optimizer, converter=converter, devices=devices)

    # Set up a trainer
    trainer = training.Trainer(
        updater, (args.epochs, 'epoch'), out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    # set up validation iterator
    valid = make_batchset(valid_json, args.batch_size,
                          args.maxlen_in, args.maxlen_out, args.minibatches)
    valid_iter = chainer.iterators.SerialIterator(
        TransformDataset(valid, converter.transform),
        1, repeat=False, shuffle=False)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(
        valid_iter, model, converter=converter, device=gpu_id))

    # Save attention weight each epoch
    if args.num_save_attention > 0 and args.mtlalpha != 1.0:
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        if hasattr(model, "module"):
            att_vis_fn = model.module.predictor.calculate_all_attentions
        else:
            att_vis_fn = model.predictor.calculate_all_attentions
        trainer.extend(PlotAttentionReport(
            att_vis_fn, data, args.outdir + "/att_ws",
            converter=converter, device=gpu_id), trigger=(1, 'epoch'))

    # Take a snapshot for each specified epoch
    trainer.extend(
        extensions.snapshot(filename='snapshot.ep.{.updater.epoch}'),
        trigger=(1, 'epoch'))

    # Make a plot for training and validation values
    trainer.extend(extensions.PlotReport([
        'main/loss', 'validation/main/loss',
        'main/loss_ctc', 'validation/main/loss_ctc',
        'main/loss_att', 'validation/main/loss_att'
    ], 'epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(
        ['main/acc', 'validation/main/acc'], 'epoch', file_name='acc.png'))

    # Save best models
    trainer.extend(
        extensions.snapshot_object(model, 'model.loss.best'),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))
    # FIX: was `mtl_mode is not 'ctc'` — identity comparison with a str
    # literal (a SyntaxWarning on CPython >= 3.8); value comparison intended.
    if mtl_mode != 'ctc':
        trainer.extend(
            extensions.snapshot_object(model, 'model.acc.best'),
            trigger=training.triggers.MaxValueTrigger('validation/main/acc'))

    # epsilon decay in the optimizer: when the monitored metric degrades,
    # restore the best snapshot and shrink AdaDelta's eps
    if args.opt == 'adadelta':
        if args.criterion == 'acc' and mtl_mode != 'ctc':
            trainer.extend(restore_snapshot(
                model, args.outdir + '/model.acc.best'),
                trigger=CompareValueTrigger(
                    'validation/main/acc',
                    lambda best_value, current_value:
                        best_value > current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/acc',
                               lambda best_value, current_value:
                                   best_value > current_value))
        elif args.criterion == 'loss':
            trainer.extend(restore_snapshot(
                model, args.outdir + '/model.loss.best'),
                trigger=CompareValueTrigger(
                    'validation/main/loss',
                    lambda best_value, current_value:
                        best_value < current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/loss',
                               lambda best_value, current_value:
                                   best_value < current_value))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(
        trigger=(REPORT_INTERVAL, 'iteration')))
    report_keys = [
        'epoch', 'iteration', 'main/loss', 'main/loss_ctc', 'main/loss_att',
        'validation/main/loss', 'validation/main/loss_ctc',
        'validation/main/loss_att', 'main/acc', 'validation/main/acc',
        'elapsed_time'
    ]
    if args.opt == 'adadelta':
        trainer.extend(extensions.observe_value(
            'eps',
            lambda trainer: trainer.updater.get_optimizer('main').eps),
            trigger=(REPORT_INTERVAL, 'iteration'))
        report_keys.append('eps')
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(REPORT_INTERVAL, 'iteration'))
    trainer.extend(extensions.ProgressBar(update_interval=REPORT_INTERVAL))

    # Run the training
    trainer.run()
def main(arg_list=None):
    """Entry point: parse CLI options, load data, and run staged Chainer training.

    Supports three modes: normal training, fold-network training
    (``--train-fold``), and RPL-layer training (``--train-rpl``).
    Training runs in stages (one per entry of --epoch/--optimizer/--batchsize/--lr).

    :param arg_list: optional list of arguments to parse instead of sys.argv
        (each element is str()-converted before parsing)
    """
    parser = argparse.ArgumentParser(description='Chainer LSTM')
    parser.add_argument('--epoch', '-e', type=int, nargs='+', default=[20],
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--optimizer', '-o', nargs='+', default=['momentumsgd'],
                        help='Optimizer (sgd, momentumsgd, adam)')
    parser.add_argument('--batchsize', '-b', type=int, nargs='+', default=[128],
                        help='Number of training points in each mini-batch')
    parser.add_argument('--lr', type=float, nargs='+',
                        default=[1e-2, 1e-3, 1e-4, 1e-5], help='Learning rate')
    parser.add_argument(
        '--network', '-n', default='ff', help=
        'Neural network type, either "ff", "tdnn", "lstm", "zoneoutlstm", "peepholelstm" or "gru". Setting any recurrent network implies "--shuffle-sequences"'
    )
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--units', '-u', type=int, nargs='+', default=[1024],
                        help='Number of units')
    parser.add_argument('--layers', '-l', type=int, default=2,
                        help='Number of hidden layers')
    parser.add_argument('--activation', '-a', default='relu',
                        help='FF activation function (sigmoid, tanh or relu)')
    parser.add_argument('--tdnn-ksize', type=int, nargs='+', default=[5],
                        help='TDNN kernel size')
    parser.add_argument('--bproplen', type=int, default=20,
                        help='Backpropagation length')
    parser.add_argument('--timedelay', type=int, default=0,
                        help='Delay target values by this many time steps')
    parser.add_argument('--noplot', dest='plot', action='store_false',
                        help='Disable PlotReport extension')
    parser.add_argument('--splice', type=int, default=0, help='Splicing size')
    parser.add_argument(
        '--dropout', '-d', type=float, nargs='+', default=[0], help=
        'Dropout rate (0 to disable). In case of Zoneout LSTM, this parameter has 2 arguments: c_ratio h_ratio'
    )
    parser.add_argument('--ft', default='final.feature_transform',
                        help='Kaldi feature transform file')
    parser.add_argument('--tri', action='store_true', help='Use triphones')
    parser.add_argument(
        '--shuffle-sequences', action='store_true', help=
        'True if sequences should be shuffled as a whole, otherwise all frames will be shuffled independent of each other'
    )
    parser.add_argument(
        '--data-dir', default='data/fmllr', help=
        'Data directory, this will be prepended to data files and feature transform'
    )
    parser.add_argument(
        '--offset-dir', default='data',
        help='Data directory, this will be prepended to offset files')
    parser.add_argument(
        '--target-dir', default='data/targets',
        help='Data directory, this will be prepended to target files')
    parser.add_argument(
        '--ivector-dir',
        help='Data directory, this will be prepended to ivector files')
    parser.add_argument('--data', default='data_{}.npy', help='Training data')
    parser.add_argument('--offsets', default='offsets_{}.npy',
                        help='Training offsets')
    parser.add_argument('--targets', default='targets_{}.npy',
                        help='Training targets')
    parser.add_argument('--ivectors', default='ivectors_{}.npy',
                        help='Training ivectors')
    parser.add_argument('--no-validation', dest='use_validation',
                        action='store_false',
                        help='Do not evaluate validation data while training')
    parser.add_argument('--train-fold', type=int,
                        help='Train fold network with this ID')
    parser.add_argument('--train-rpl', action='store_true',
                        help='Train RPL layer')
    parser.add_argument('--rpl-model', default="result_rpl/model",
                        help='RPL layer model')
    parser.add_argument('--fold-data-dir', default="fold_data",
                        help='Directory with fold input data')
    parser.add_argument('--fold-output-dir', default="fold_data_out",
                        help='Directory with predicted fold output')
    parser.add_argument('--fold-model-dir', default="fold_models",
                        help='Directory with output fold model')
    parser.add_argument(
        '--fold-data-pattern', default='data_{0}.npy', help=
        'Filename pattern of each fold data, {0} will be replaced by fold ID')
    parser.add_argument('--fold-offset-pattern', default='offsets_{0}.npy',
                        help='Filename pattern of each fold offset')
    parser.add_argument('--fold-target-pattern', default='targets_{0}.npy',
                        help='Filename pattern of each fold targets')
    parser.add_argument(
        '--fold-ivector-pattern', default='ivectors_{}.npy', help=
        'Filename pattern of each fold i-vectors file, {} will be replaced by fold ID'
    )
    parser.add_argument('--fold-output-pattern', default='data_{0}.npy',
                        help='Filename pattern of each fold network output')
    parser.add_argument('--fold-network-pattern', default='fold_{0}.npz',
                        help='Filename pattern of each fold network')
    parser.add_argument('--no-progress', action='store_true',
                        help='Disable progress bar')
    if arg_list is not None:
        args = parser.parse_args(list(map(str, arg_list)))
    else:
        args = parser.parse_args()
    # set options implied by other options
    if is_nn_recurrent(args.network):
        args.shuffle_sequences = True
    # create output directories
    Path(args.out).mkdir(exist_ok=True, parents=True)
    if args.train_fold is not None:
        file_out = Path(args.fold_model_dir,
                        args.fold_network_pattern.format(args.train_fold))
        Path(file_out.parent).mkdir(exist_ok=True, parents=True)
    # print arguments to the file
    with open(args.out + "/args.txt", "w") as f:
        for attr in dir(args):
            if not attr.startswith('_'):
                f.write('# {}: {}\n'.format(attr, getattr(args, attr)))
        f.write(' '.join(
            map(lambda x: "'" + x + "'" if ' ' in x else x, sys.argv)) + '\n')
    # print arguments to stdout
    for attr in dir(args):
        if not attr.startswith('_'):
            print('# {}: {}'.format(attr, getattr(args, attr)))
    print('')
    # input feature vector length
    num_classes = 1909 if args.tri else 39
    # create model
    if args.train_rpl:
        model = RPL4(num_classes)
        model_cls = L.Classifier(model)
    else:
        if args.activation == "sigmoid":
            activation = F.sigmoid
        elif args.activation == "tanh":
            activation = F.tanh
        elif args.activation == "relu":
            activation = F.relu
        else:
            print("Wrong activation function specified")
            return
        model = get_nn(args.network, args.layers, args.units, num_classes,
                       activation, args.tdnn_ksize, args.dropout)
        # classifier reports softmax cross entropy loss and accuracy at every
        # iteration, which will be used by the PrintReport extension below.
        model_cls = L.Classifier(model)
    if args.gpu >= 0:
        # make a specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model_cls.to_gpu()  # copy the model to the GPU
    offsets = offsets_dev = None
    if args.train_rpl:
        # load training data: RPL input is the predicted output of each fold
        # network, targets come from the fold data directory
        fold = 0
        x = []
        y = []
        while True:
            x_file = Path(args.fold_output_dir,
                          args.fold_output_pattern.format(fold))
            y_file = Path(args.fold_data_dir,
                          args.fold_target_pattern.format(fold))
            if not x_file.is_file() or not y_file.is_file():
                break
            print("Loading fold {} data".format(fold))
            x_ = np.load(str(x_file))
            y_ = np.load(str(y_file))
            x.append(x_)
            y.append(y_)
            fold += 1
        if fold == 0:
            print("Error: No fold data found")
            return
        x = np.concatenate(x, axis=0)
        y = np.concatenate(y, axis=0)
        if args.use_validation:
            #TODO: use args.data instead of args.dev_data
            x_dev = np.load(str(Path(args.data_dir, args.data.format("dev"))))
            # offsets_dev = loadBin(str(Path(args.datadir, args.dev_offsets)), np.int32)
            y_dev = np.load(
                str(Path(args.target_dir, args.targets.format("dev"))))
    else:
        # load training data
        ivectors = None
        ivectors_dev = None
        if args.train_fold is not None:
            # concatenate every fold except the one being trained
            x = []
            offsets = [0]
            y = []
            ivectors = []
            num = 0
            fold = 0
            while True:
                if fold != args.train_fold:
                    x_file = Path(args.fold_data_dir,
                                  args.fold_data_pattern.format(fold))
                    if not x_file.is_file():
                        break
                    offsets_file = Path(args.fold_data_dir,
                                        args.fold_offset_pattern.format(fold))
                    y_file = Path(args.fold_data_dir,
                                  args.fold_target_pattern.format(fold))
                    if args.ivector_dir is not None:
                        ivectors_file = Path(
                            args.fold_data_dir,
                            args.fold_ivector_pattern.format(fold))
                        if not ivectors_file.is_file():
                            print("Error: missing ivectors for fold data {}".
                                  format(fold))
                            return
                    print("Loading fold {} data".format(fold))
                    x_fold = np.load(str(x_file))
                    x.append(x_fold)
                    if is_nn_recurrent(args.network):
                        offsets_fold = np.load(str(offsets_file))
                        # shift per-fold sequence offsets by frames loaded so far
                        offsets.extend(offsets_fold[1:] + num)
                    y_fold = np.load(str(y_file))
                    y.append(y_fold)
                    if args.ivector_dir is not None:
                        ivectors_fold = np.load(str(ivectors_file))
                        ivectors.append(ivectors_fold)
                    num += x_fold.shape[0]
                fold += 1
            if len(x) == 0:
                print("Error: No fold data found")
                return
            x = np.concatenate(x, axis=0)
            if is_nn_recurrent(args.network):
                offsets = np.array(offsets, dtype=np.int32)
            y = np.concatenate(y, axis=0)
            if args.ivector_dir is not None:
                ivectors = np.concatenate(ivectors, axis=0)
        else:
            x = np.load(str(Path(args.data_dir, args.data.format("train"))))
            if is_nn_recurrent(args.network):
                offsets = np.load(
                    str(Path(args.offset_dir, args.offsets.format("train"))))
            y = np.load(
                str(Path(args.target_dir, args.targets.format("train"))))
            if args.ivector_dir is not None:
                ivectors = np.load(
                    str(Path(args.ivector_dir, args.ivectors.format("train"))))
        if args.use_validation:
            x_dev = np.load(str(Path(args.data_dir, args.data.format("dev"))))
            if is_nn_recurrent(args.network):
                offsets_dev = np.load(
                    str(Path(args.offset_dir, args.offsets.format("dev"))))
            y_dev = np.load(
                str(Path(args.target_dir, args.targets.format("dev"))))
            if args.ivector_dir is not None:
                ivectors_dev = np.load(
                    str(Path(args.ivector_dir, args.ivectors.format("dev"))))
        # apply splicing
        if args.network == "tdnn":
            # TDNN consumes its context through the kernel, splice accordingly
            splice = (sum(args.tdnn_ksize) - len(args.tdnn_ksize)) // 2
        else:
            splice = args.splice
        if splice > 0:
            x = splicing(x, range(-splice, splice + 1))
            # BUGFIX: x_dev only exists when validation data was loaded
            if args.use_validation:
                x_dev = splicing(x_dev, range(-splice, splice + 1))
        # load feature transform
        # BUGFIX: the condition was inverted ("not args.ft"), which skipped the
        # transform for the non-empty default and tried to load an empty path
        # otherwise; '-' explicitly disables the transform
        if args.ft and args.ft != '-':
            ft = loadKaldiFeatureTransform(str(Path(args.data_dir, args.ft)))
            if is_nn_recurrent(
                    args.network
            ):  # select transform middle frame if the network is recurrent
                dim = ft["shape"][1]
                zi = ft["shifts"].index(0)
                ft["rescale"] = ft["rescale"][zi * dim:(zi + 1) * dim]
                ft["addShift"] = ft["addShift"][zi * dim:(zi + 1) * dim]
                ft["shape"][0] = dim
                ft["shifts"] = [0]
            elif args.network == "tdnn":
                # tile the middle-frame transform across the spliced window
                dim = ft["shape"][1]
                zi = ft["shifts"].index(0)
                winlen = 2 * splice + 1
                ft["rescale"] = np.tile(ft["rescale"][zi * dim:(zi + 1) * dim],
                                        winlen)
                ft["addShift"] = np.tile(
                    ft["addShift"][zi * dim:(zi + 1) * dim], winlen)
                ft["shape"][0] = dim * winlen
                ft["shifts"] = list(range(-splice, splice + 1))
            # apply feature transform
            x = applyKaldiFeatureTransform(x, ft)
            if args.use_validation:
                x_dev = applyKaldiFeatureTransform(x_dev, ft)
        if ivectors is not None:
            x = np.concatenate((x, ivectors), axis=1)
        if ivectors_dev is not None:
            x_dev = np.concatenate((x_dev, ivectors_dev), axis=1)
        # shift the input dataset according to time delay
        if is_nn_recurrent(args.network) and args.timedelay != 0:
            x, y, offsets = apply_time_delay(x, y, offsets, args.timedelay)
            if args.use_validation:
                x_dev, y_dev, offsets_dev = apply_time_delay(
                    x_dev, y_dev, offsets_dev, args.timedelay)
    # create chainer datasets
    train_dataset = chainer.datasets.TupleDataset(x, y)
    if args.use_validation:
        dev_dataset = chainer.datasets.TupleDataset(x_dev, y_dev)
    # prepare train stages; shorter option lists are padded with their last value
    train_stages_len = max(len(args.batchsize), len(args.lr))
    train_stages = [{
        'epoch': index_padded(args.epoch, i),
        'opt': index_padded(args.optimizer, i),
        'bs': index_padded(args.batchsize, i),
        'lr': index_padded(args.lr, i)
    } for i in range(train_stages_len)]
    for i, ts in enumerate(train_stages):
        if ts['opt'] == 'adam':
            # learning rate not used, don't print it
            print(
                "=== Training stage {}: epoch = {}, batchsize = {}, optimizer = {}"
                .format(i, ts['epoch'], ts['bs'], ts['opt']))
        else:
            print(
                "=== Training stage {}: epoch = {}, batchsize = {}, optimizer = {}, learning rate = {}"
                .format(i, ts['epoch'], ts['bs'], ts['opt'], ts['lr']))
        # reset state to allow training with different batch size in each stage
        if not args.train_rpl and is_nn_recurrent(args.network):
            model.reset_state()
        # setup an optimizer
        if ts['opt'] == "sgd":
            optimizer = chainer.optimizers.SGD(lr=ts['lr'])
        elif ts['opt'] == "momentumsgd":
            optimizer = chainer.optimizers.MomentumSGD(lr=ts['lr'])
        elif ts['opt'] == "adam":
            optimizer = chainer.optimizers.Adam()
        else:
            print("Wrong optimizer specified: {}".format(ts['opt']))
            exit(1)
        optimizer.setup(model_cls)
        if args.shuffle_sequences:
            train_iter = SequenceShuffleIterator(train_dataset, offsets,
                                                 ts['bs'])
            if args.use_validation:
                dev_iter = SequenceShuffleIterator(dev_dataset, None, ts['bs'],
                                                   repeat=False, shuffle=False)
        else:
            train_iter = SerialIterator(train_dataset, ts['bs'])
            if args.use_validation:
                dev_iter = SerialIterator(dev_dataset, ts['bs'], repeat=False,
                                          shuffle=False)
        # set up a trainer
        if is_nn_recurrent(args.network):
            updater = BPTTUpdater(train_iter, optimizer, args.bproplen,
                                  device=args.gpu)
        else:
            updater = StandardUpdater(train_iter, optimizer, device=args.gpu)
        if args.use_validation:
            stop_trigger = EarlyStoppingTrigger(ts['epoch'],
                                                key='validation/main/loss',
                                                eps=-0.001)
        else:
            stop_trigger = (ts['epoch'], 'epoch')
        trainer = training.Trainer(updater, stop_trigger,
                                   out="{}/{}".format(args.out, i))
        trainer.extend(model_saver)
        # evaluate the model with the development dataset for each epoch
        if args.use_validation:
            trainer.extend(
                extensions.Evaluator(dev_iter, model_cls, device=args.gpu))
        # dump a computational graph from 'loss' variable at the first iteration
        # the "main" refers to the target link of the "main" optimizer.
        trainer.extend(extensions.dump_graph('main/loss'))
        # take a snapshot for each specified epoch
        frequency = ts['epoch'] if args.frequency == -1 else max(
            1, args.frequency)
        trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))
        # write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())
        # save two plot images to the result dir
        if args.plot and extensions.PlotReport.available():
            plot_vars_loss = ['main/loss']
            plot_vars_acc = ['main/accuracy']
            if args.use_validation:
                plot_vars_loss.append('validation/main/loss')
                plot_vars_acc.append('validation/main/accuracy')
            trainer.extend(
                extensions.PlotReport(plot_vars_loss, 'epoch',
                                      file_name='loss.png'))
            trainer.extend(
                extensions.PlotReport(plot_vars_acc, 'epoch',
                                      file_name='accuracy.png'))
        # print selected entries of the log to stdout
        # here "main" refers to the target link of the "main" optimizer again, and
        # "validation" refers to the default name of the Evaluator extension.
        # entries other than 'epoch' are reported by the Classifier link, called by
        # either the updater or the evaluator.
        if args.use_validation:
            print_report_vars = [
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]
        else:
            print_report_vars = [
                'epoch', 'main/loss', 'main/accuracy', 'elapsed_time'
            ]
        trainer.extend(extensions.PrintReport(print_report_vars))
        # print a progress bar to stdout
        # trainer.extend(extensions.ProgressBar())
        if args.resume:
            # Resume from a snapshot
            chainer.serializers.load_npz(args.resume, trainer)
        # Run the training
        trainer.run()
        # load the last model if the max epoch was not reached (that means early
        # stopping trigger stopped training because the validation loss increased)
        if updater.epoch_detail < ts['epoch']:
            chainer.serializers.load_npz(
                "{}/{}/model_tmp".format(args.out, i), model_cls)
            # remove temporary model from this training stage
            os.remove("{}/{}/model_tmp".format(args.out, i))
    # save the final model
    chainer.serializers.save_npz("{}/model".format(args.out), model_cls)
    if args.train_fold is not None:
        chainer.serializers.save_npz(
            str(
                Path(args.fold_model_dir,
                     args.fold_network_pattern.format(args.train_fold))),
            model_cls)
def main():
    """Train the ADPPD MLP regressor with Chainer and save it as npz."""
    # initial setup: command-line options
    parser = argparse.ArgumentParser(description='Chainer')
    parser.add_argument('--batchsize', '-b', type=int, default=50,
                        help='バッチサイズ')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='エポック数')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPUの有無')
    parser.add_argument('--out', '-o', default='result',
                        help='リサルトファイルのフォルダ')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=unitSize,
                        help='中間層の数')
    args = parser.parse_args()
    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')
    ## build the MLP model here; mean squared error because this is regression
    model = L.Classifier(ADPPD(args.unit, outUnit),lossfun=F.mean_squared_error)
    # accuracy is meaningless for a regression loss, so disable it
    model.compute_accuracy = False
    # GPU availability check
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()
        xp = cp
        print('I use GPU and cupy')
    ## set up the optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    ## wrap the datasets for chainer
    train_data = DatasetPourDot(trainQuePath,trainAnsPath)
    test_data = DatasetPourDot(testQuePath,testAnsPath)
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test_data, args.batchsize,
                                                 repeat=False, shuffle=False)
    ## updater performs the weight updates; plain StandardUpdater here
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    ## hand the updater to the trainer
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    ## use an Evaluator for the evaluation pass
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))
    ## extensions for reporting intermediate progress
    trainer.extend(extensions.dump_graph('main/loss'))
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))
    trainer.extend(extensions.LogReport())
    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'], 'epoch',
                file_name='accuracy.png'))
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
         'validation/main/accuracy', 'elapsed_time']))
    trainer.extend(extensions.ProgressBar())
    # resume from snapshot data if any was given
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)
    # start the experiment; the trainer drives everything from here
    trainer.run()
    # move back to CPU so the saved model is usable without a GPU
    model.to_cpu()
    # write the model out in npz format
    chainer.serializers.save_npz(args.out+'/mymodel.npz', model)
def main():
    """Train an FCN8s foreground/background segmentation model.

    Parses only ``--gpu`` from the command line; the remaining run settings
    (iteration counts, output directory, git hash, hostname, timestamp) are
    attached to ``args`` so ParamsReport can log them.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-g', '--gpu', type=int, default=0)
    args = parser.parse_args()

    # fixed run configuration + provenance metadata for ParamsReport
    args.max_iteration = 10000
    args.interval_eval = 1000
    args.interval_print = 10
    args.git_hash = instance_occlsegm_lib.utils.git_hash(__file__)
    args.hostname = socket.gethostname()
    now = datetime.datetime.now()
    args.timestamp = now.isoformat()
    args.out = osp.join(here, 'logs/train_fcn_fgbg',
                        now.strftime('%Y%m%d_%H%M%S'))
    # exist_ok only tolerates a pre-existing directory; unlike the previous
    # bare "except OSError: pass", real failures (e.g. permissions) now raise
    os.makedirs(args.out, exist_ok=True)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()

    # dataset: raw iterator for visualization, transformed one for training
    data = contrib.datasets.InstanceImageDataset()
    class_names = data.class_names
    data_train = chainer.datasets.TransformDataset(data, Transform(train=True))
    iter_train = chainer.iterators.SerialIterator(data_train, batch_size=1)
    iter_test = chainer.iterators.SerialIterator(data, batch_size=1)

    # model: FCN8sAtOnce initialized from pretrained VGG16 weights
    model = contrib.models.FCN8sAtOnce(n_class=len(class_names))
    vgg16 = fcn.models.VGG16()
    chainer.serializers.load_npz(vgg16.pretrained_model, vgg16)
    model.init_from_vgg16(vgg16)
    model = chainercv.links.PixelwiseSoftmaxClassifier(predictor=model)
    if args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.Adam(alpha=1e-5)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0005))
    # keep the upsampling (deconvolution) layers fixed during training
    model.predictor.upscore2.disable_update()
    model.predictor.upscore_pool4.disable_update()
    model.predictor.upscore8.disable_update()

    updater = training.StandardUpdater(iter_train, optimizer, device=args.gpu)
    trainer = training.Trainer(
        updater, stop_trigger=(args.max_iteration, 'iteration'), out=args.out)

    trainer.extend(contrib.extensions.ParamsReport(args.__dict__))
    trainer.extend(extensions.snapshot_object(
        target=model.predictor,
        filename='model_{.updater.iteration:08}.npz'),
        trigger=(args.interval_eval, 'iteration'))
    trainer.extend(extensions.LogReport(
        trigger=(args.interval_print, 'iteration')))
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'elapsed_time', 'main/loss']))
    assert extensions.PlotReport.available()
    trainer.extend(extensions.PlotReport(
        y_keys=['main/loss'], x_key='iteration',
        file_name='loss.png', trigger=(args.interval_print, 'iteration')))
    # periodic qualitative check: segmentation visualizations on raw data
    trainer.extend(
        contrib.extensions.SemanticSegmentationVisReport(
            iter_test, transform=Transform(train=False),
            class_names=class_names, device=args.gpu, shape=(15, 5)),
        trigger=(args.interval_print, 'iteration'))
    trainer.extend(extensions.ProgressBar(update_interval=5))

    trainer.run()
def train(args):
    """Train E2E-TTS model.

    :param Namespace args: The program arguments
    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())

    # reverse input and output dimension (TTS consumes text and emits features)
    idim = int(valid_json[utts[0]]["output"][0]["shape"][1])
    odim = int(valid_json[utts[0]]["input"][0]["shape"][1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # get extra input and output dimenstion
    if args.use_speaker_embedding:
        args.spk_embed_dim = int(valid_json[utts[0]]["input"][1]["shape"][0])
    else:
        args.spk_embed_dim = None
    if args.use_second_target:
        args.spc_dim = int(valid_json[utts[0]]["input"][1]["shape"][1])
    else:
        args.spc_dim = None

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        # NOTE: fixed missing space after "to" in the log message
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    # specify model architecture
    if args.enc_init is not None or args.dec_init is not None:
        model = load_trained_modules(idim, odim, args, TTSInterface)
    else:
        model_class = dynamic_import(args.model_module)
        model = model_class(idim, odim, args)
    assert isinstance(model, TTSInterface)
    logging.info(model)
    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu)))
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)"
                % (args.batch_size, args.batch_size * args.ngpu)
            )
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # freeze modules, if specified
    if args.freeze_mods:
        if hasattr(model, "module"):
            # DataParallel prefixes parameter names with "module."
            freeze_mods = ["module." + x for x in args.freeze_mods]
        else:
            freeze_mods = args.freeze_mods

        for mod, param in model.named_parameters():
            if any(mod.startswith(key) for key in freeze_mods):
                logging.info(f"{mod} is frozen not to be updated.")
                param.requires_grad = False

        model_params = filter(lambda x: x.requires_grad, model.parameters())
    else:
        model_params = model.parameters()
    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
            sum(p.numel() for p in model.parameters() if p.requires_grad)
            * 100.0
            / sum(p.numel() for p in model.parameters()),
        )
    )

    # Setup an optimizer
    if args.opt == "adam":
        optimizer = torch.optim.Adam(
            model_params, args.lr, eps=args.eps, weight_decay=args.weight_decay
        )
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(
            model_params, args.adim, args.transformer_warmup_steps, args.transformer_lr
        )
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # FIXME: TOO DIRTY HACK
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # read json data
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    if use_sortagrad:
        args.batch_sort_key = "input"
    # make minibatch list (variable length)
    train_batchset = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        batch_sort_key=args.batch_sort_key,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        swap_io=True,
        iaxis=0,
        oaxis=0,
    )
    valid_batchset = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        batch_sort_key=args.batch_sort_key,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        swap_io=True,
        iaxis=0,
        oaxis=0,
    )

    load_tr = LoadInputsAndTargets(
        mode="tts",
        use_speaker_embedding=args.use_speaker_embedding,
        use_second_target=args.use_second_target,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
        keep_all_data_on_mem=args.keep_all_data_on_mem,
    )
    load_cv = LoadInputsAndTargets(
        mode="tts",
        use_speaker_embedding=args.use_speaker_embedding,
        use_second_target=args.use_second_target,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
        keep_all_data_on_mem=args.keep_all_data_on_mem,
    )

    converter = CustomConverter()
    # hack to make batchsize argument as 1
    # actual bathsize is included in a list
    train_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(
                train_batchset, lambda data: converter([load_tr(data)])
            ),
            batch_size=1,
            num_workers=args.num_iter_processes,
            shuffle=not use_sortagrad,
            collate_fn=lambda x: x[0],
        )
    }
    valid_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(
                valid_batchset, lambda data: converter([load_cv(data)])
            ),
            batch_size=1,
            shuffle=False,
            collate_fn=lambda x: x[0],
            num_workers=args.num_iter_processes,
        )
    }

    # Set up a trainer
    updater = CustomUpdater(
        model, args.grad_clip, train_iter, optimizer, device, args.accum_grad
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # set intervals
    eval_interval = (args.eval_interval_epochs, "epoch")
    save_interval = (args.save_interval_epochs, "epoch")
    report_interval = (args.report_interval_iters, "iteration")

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        CustomEvaluator(model, valid_iter, reporter, device), trigger=eval_interval
    )

    # Save snapshot for each epoch
    trainer.extend(torch_snapshot(), trigger=save_interval)

    # Save best models
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger(
            "validation/main/loss", trigger=eval_interval
        ),
    )

    # Save attention figure for each epoch
    if args.num_save_attention > 0:
        data = sorted(
            list(valid_json.items())[: args.num_save_attention],
            key=lambda x: int(x[1]["output"][0]["shape"][0]),
        )
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
            reduction_factor = model.module.reduction_factor
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
            reduction_factor = model.reduction_factor
        if reduction_factor > 1:
            # fix the length to crop attention weight plot correctly
            data = copy.deepcopy(data)
            for idx in range(len(data)):
                ilen = data[idx][1]["input"][0]["shape"][0]
                data[idx][1]["input"][0]["shape"][0] = ilen // reduction_factor
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
            reverse=True,
        )
        trainer.extend(att_reporter, trigger=eval_interval)
    else:
        att_reporter = None

    # Make a plot for training and validation values
    if hasattr(model, "module"):
        base_plot_keys = model.module.base_plot_keys
    else:
        base_plot_keys = model.base_plot_keys
    plot_keys = []
    for key in base_plot_keys:
        plot_key = ["main/" + key, "validation/main/" + key]
        trainer.extend(
            extensions.PlotReport(plot_key, "epoch", file_name=key + ".png"),
            trigger=eval_interval,
        )
        plot_keys += plot_key
    trainer.extend(
        extensions.PlotReport(plot_keys, "epoch", file_name="all_loss.png"),
        trigger=eval_interval,
    )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=report_interval))
    report_keys = ["epoch", "iteration", "elapsed_time"] + plot_keys
    trainer.extend(extensions.PrintReport(report_keys), trigger=report_interval)
    trainer.extend(extensions.ProgressBar(), trigger=report_interval)

    set_early_stop(trainer, args)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        from torch.utils.tensorboard import SummaryWriter

        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(TensorboardLogger(writer, att_reporter), trigger=report_interval)

    if use_sortagrad:
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
        )

    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
def train_model():
    """Train a convnet (3-channel images) selected by --arch and save it.

    Parses its own command line; writes the trained model and a timing
    summary to the output directory.

    NOTE: converted from Python 2 print statements to print() calls so the
    block parses under Python 3 (the rest of this file already uses
    Python-3-only syntax), and replaced time.clock() — removed in
    Python 3.8 — with time.perf_counter().
    """
    archs = {
        'mymodel': mymodel.MyModel,
        'nin': nin.NIN,
        'alex': alex.Alex,
        'lenet': lenet5.Lenet5,
        'vgg': vgg16.VGG16,
        'googlenet': googlenet.GoogLeNet,
        'deepface': deepface.DeepFace
    }
    parser = argparse.ArgumentParser(
        description='Training convnet from dataset (only 3 channels image)')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('test', help='Path to test image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Training minibatch size')
    parser.add_argument('--test_batchsize', '-b', type=int, default=250,
                        help='Test minibatch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Initialize model to train
    model = archs[args.arch]()
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Load datasets
    mean = np.load(args.mean)
    train = PreprocessedDataset(args.train, args.root, mean, model.insize)
    test = PreprocessedDataset(args.test, args.root, mean, model.insize)

    # Set up iterator
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.test_batchsize,
                                                 repeat=False, shuffle=False)

    # Set up optimizer
    optimizer = chainer.optimizers.AdaDelta()
    optimizer.setup(model)

    # Set up trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # Copy chain with shared parameters to flip 'train' flag only in test
    eval_model = model.copy()
    eval_model.train = False
    trainer.extend(extensions.Evaluator(test_iter, eval_model,
                                        device=args.gpu))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'], 'epoch',
                              file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'],
                              'epoch', file_name='accuracy.png'))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    # Run trainer
    date = datetime.datetime.today()
    # perf_counter is the Python 3 replacement for the removed time.clock
    start_time = time.perf_counter()
    trainer.run()
    total_time = datetime.timedelta(seconds=time.perf_counter() - start_time)

    # Save trained model
    print('')
    print('Training has been finished.')
    print('Total training time: {}.'.format(total_time))
    # end=' ' reproduces the Python 2 trailing-comma print behavior
    print('Saving the trained model...', end=' ')
    chainer.serializers.save_npz(
        os.path.join(args.out, 'model_final_' + args.arch), model)
    print('----> done')
    # with-block guarantees the info file is closed (it previously leaked)
    with open(os.path.join(args.out, 'info'), 'a') as info:
        info.write('Date: {}.\n'.format(date.strftime("%Y/%m/%d %H:%M:%S")))
        info.write('----> Total training time: {}.'.format(total_time))
def main():
    """ChainerMN example entry point: data-parallel VGG16 on CIFAR-10/100.

    Builds a communicator, scatters the dataset to workers via multi-node
    iterators, and runs a standard Chainer trainer; reporting extensions
    are registered on rank 0 only.
    """
    parser = argparse.ArgumentParser(description='ChainerMN example: VGG16')
    parser.add_argument('--dataset', '-d', default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--learnrate', '-l', type=float, default=0.05,
                        help='Learning rate for SGD')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu', '-g', action='store_true', default=False,
                        help='use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--noplot', dest='plot', action='store_false',
                        help='Disable PlotReport extension')
    args = parser.parse_args()

    # Create ChainerMN communicator.
    # 'hierarchical' requires GPUs; 'naive' is the CPU-only fallback.
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        device = comm.rank
    else:
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print('GPU: {}'.format(args.gpu))
        print('# Minibatch-size: {}'.format(args.batchsize))
        print('# epoch: {}'.format(args.epoch))
        print('')

    # Load the CIFAR10 dataset
    if args.dataset == 'cifar10':
        class_labels = 10
        train, test = chainer.datasets.get_cifar10()
    elif args.dataset == 'cifar100':
        class_labels = 100
        train, test = chainer.datasets.get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    model = L.Classifier(VGG.VGG(comm, class_labels))
    if args.gpu:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Non-root ranks receive their shard through the multi-node iterator,
    # so they start from an empty placeholder dataset.
    if comm.rank != 0:
        train = chainermn.datasets.create_empty_dataset(train)
        test = chainermn.datasets.create_empty_dataset(test)

    train_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.SerialIterator(train, args.batchsize), comm)
    test_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.SerialIterator(test, args.batchsize,
                                         repeat=False, shuffle=False),
        comm)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reporting/plotting extensions only make sense on one process.
    if comm.rank == 0:
        # Dump a computational graph from 'loss' variable
        # The "main" refers to the target link of the "main" optimizer.
        trainer.extend(extensions.DumpGraph('main/loss'))

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())

        # Save two plot images to the result dir
        if args.plot and extensions.PlotReport.available():
            trainer.extend(
                extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                      'epoch', file_name='loss.png'))
            trainer.extend(
                extensions.PlotReport(
                    ['main/accuracy', 'validation/main/accuracy'],
                    'epoch', file_name='accuracy.png'))

        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss',
                'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    # Run the training
    trainer.run()
def train(args):
    """Run ASR training (attention / CTC / hybrid) with PyTorch backend.

    :param Namespace args: parsed program arguments

    Fix vs. previous version: ``mtl_mode is not 'ctc'`` compared string
    *identity*, which is implementation-dependent and a SyntaxWarning on
    CPython 3.8+; replaced with ``!=`` in both places.
    """
    # seed setting
    torch.manual_seed(args.seed)

    # debug mode setting
    # 0 would be fastest, but 1 seems to be reasonable
    # by considering reproducibility
    # remove type check
    if args.debugmode < 2:
        chainer.config.type_check = False
        logging.info('torch type check is disabled')
    # use deterministic computation or not
    if args.debugmode < 1:
        torch.backends.cudnn.deterministic = False
        logging.info('torch cudnn deterministic is disabled')
    else:
        torch.backends.cudnn.deterministic = True

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')

    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]['input'][0]['shape'][1])
    odim = int(valid_json[utts[0]]['output'][0]['shape'][1])
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # specify attention, CTC, hybrid mode
    if args.mtlalpha == 1.0:
        mtl_mode = 'ctc'
        logging.info('Pure CTC mode')
    elif args.mtlalpha == 0.0:
        mtl_mode = 'att'
        logging.info('Pure attention mode')
    else:
        mtl_mode = 'mtl'
        logging.info('Multitask learning mode')

    # specify model architecture
    e2e = E2E(idim, odim, args)
    model = Loss(e2e, args.mtlalpha)

    if args.rnnlm is not None:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(args.char_list),
                             rnnlm_args.layer,
                             rnnlm_args.unit))
        torch.load(args.rnnlm, rnnlm)
        e2e.rnnlm = rnnlm

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to ' + model_conf)
        f.write(
            json.dumps((idim, odim, vars(args)),
                       indent=4, sort_keys=True).encode('utf_8'))
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu)))
        logging.info('batch size is automatically increased (%d -> %d)' % (
            args.batch_size, args.batch_size * args.ngpu))
        args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # Setup an optimizer
    if args.opt == 'adadelta':
        optimizer = torch.optim.Adadelta(
            model.parameters(), rho=0.95, eps=args.eps)
    elif args.opt == 'adam':
        optimizer = torch.optim.Adam(model.parameters())

    # FIXME: TOO DIRTY HACK
    # The chainer trainer expects these attributes on the optimizer.
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    converter = CustomConverter(e2e.subsample[0])

    # read json data
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']

    # make minibatch list (variable length)
    train = make_batchset(train_json, args.batch_size,
                          args.maxlen_in, args.maxlen_out, args.minibatches,
                          min_batch_size=args.ngpu if args.ngpu > 1 else 1)
    valid = make_batchset(valid_json, args.batch_size,
                          args.maxlen_in, args.maxlen_out, args.minibatches,
                          min_batch_size=args.ngpu if args.ngpu > 1 else 1)

    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    if args.n_iter_processes > 0:
        train_iter = chainer.iterators.MultiprocessIterator(
            TransformDataset(train, converter.transform),
            batch_size=1, n_processes=args.n_iter_processes,
            n_prefetch=8, maxtasksperchild=20)
        valid_iter = chainer.iterators.MultiprocessIterator(
            TransformDataset(valid, converter.transform),
            batch_size=1, repeat=False, shuffle=False,
            n_processes=args.n_iter_processes,
            n_prefetch=8, maxtasksperchild=20)
    else:
        train_iter = chainer.iterators.SerialIterator(
            TransformDataset(train, converter.transform), batch_size=1)
        valid_iter = chainer.iterators.SerialIterator(
            TransformDataset(valid, converter.transform),
            batch_size=1, repeat=False, shuffle=False)

    # Set up a trainer
    updater = CustomUpdater(
        model, args.grad_clip, train_iter, optimizer, converter, device,
        args.ngpu)
    trainer = training.Trainer(
        updater, (args.epochs, 'epoch'), out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        logging.info('resumed from %s' % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        CustomEvaluator(model, valid_iter, reporter, converter, device))

    # Save attention weight each epoch (attention is present unless pure CTC)
    if args.num_save_attention > 0 and args.mtlalpha != 1.0:
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        if hasattr(model, "module"):
            # DataParallel wraps the real model in .module
            att_vis_fn = model.module.predictor.calculate_all_attentions
        else:
            att_vis_fn = model.predictor.calculate_all_attentions
        trainer.extend(PlotAttentionReport(
            att_vis_fn, data, args.outdir + "/att_ws",
            converter=converter, device=device), trigger=(1, 'epoch'))

    # Make a plot for training and validation values
    trainer.extend(extensions.PlotReport([
        'main/loss', 'validation/main/loss',
        'main/loss_ctc', 'validation/main/loss_ctc',
        'main/loss_att', 'validation/main/loss_att'],
        'epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(['main/acc', 'validation/main/acc'],
                                         'epoch', file_name='acc.png'))

    # Save best models
    trainer.extend(
        extensions.snapshot_object(model, 'model.loss.best',
                                   savefun=torch_save),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))
    if mtl_mode != 'ctc':
        # Accuracy is only reported when an attention branch exists.
        trainer.extend(
            extensions.snapshot_object(model, 'model.acc.best',
                                       savefun=torch_save),
            trigger=training.triggers.MaxValueTrigger('validation/main/acc'))

    # save snapshot which contains model and optimizer states
    trainer.extend(torch_snapshot(), trigger=(1, 'epoch'))

    # epsilon decay in the optimizer: when the monitored metric degrades,
    # restore the best snapshot and shrink adadelta's eps.
    if args.opt == 'adadelta':
        if args.criterion == 'acc' and mtl_mode != 'ctc':
            trainer.extend(
                restore_snapshot(model, args.outdir + '/model.acc.best',
                                 load_fn=torch_load),
                trigger=CompareValueTrigger(
                    'validation/main/acc',
                    lambda best_value, current_value:
                    best_value > current_value))
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    'validation/main/acc',
                    lambda best_value, current_value:
                    best_value > current_value))
        elif args.criterion == 'loss':
            trainer.extend(
                restore_snapshot(model, args.outdir + '/model.loss.best',
                                 load_fn=torch_load),
                trigger=CompareValueTrigger(
                    'validation/main/loss',
                    lambda best_value, current_value:
                    best_value < current_value))
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    'validation/main/loss',
                    lambda best_value, current_value:
                    best_value < current_value))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(REPORT_INTERVAL, 'iteration')))
    report_keys = [
        'epoch', 'iteration', 'main/loss', 'main/loss_ctc', 'main/loss_att',
        'validation/main/loss', 'validation/main/loss_ctc',
        'validation/main/loss_att', 'main/acc', 'validation/main/acc',
        'elapsed_time'
    ]
    if args.opt == 'adadelta':
        trainer.extend(extensions.observe_value(
            'eps', lambda trainer: trainer.updater.get_optimizer('main').
            param_groups[0]["eps"]), trigger=(REPORT_INTERVAL, 'iteration'))
        report_keys.append('eps')
    if args.report_cer:
        report_keys.append('validation/main/cer')
    if args.report_wer:
        report_keys.append('validation/main/wer')
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(REPORT_INTERVAL, 'iteration'))

    trainer.extend(extensions.ProgressBar(update_interval=REPORT_INTERVAL))

    # Run the training
    trainer.run()
def main():
    """Train an MLP classifier on MNIST with Chainer.

    Parses arguments, sets up data iterators, model, optimizer, and a
    trainer with monitoring extensions, then runs training (optionally
    resuming from a snapshot).

    Fix vs. previous version: ``extensions.LogReport()`` was registered
    twice; the duplicate registration is removed.
    """
    # Introduce argparse for clarity and organization.
    # Starting to use higher capacity models, thus set up for GPU.
    parser = argparse.ArgumentParser(description='Chainer-Tutorial: MLP')
    parser.add_argument('--batch_size', '-b', type=int, default=128,
                        help='Number of samples in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=100,
                        help='Number of times to train on data set')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID: -1 indicates CPU')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    args = parser.parse_args()

    # Load mnist data
    # http://docs.chainer.org/en/latest/reference/datasets.html
    train, test = chainer.datasets.get_mnist()

    # Define iterators.
    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batch_size, repeat=False, shuffle=False)

    # Initialize model: Loss function defaults to softmax_cross_entropy.
    # 784 is dimension of the inputs, 625 is n_units in hidden layer
    # and 10 is the output dimension.
    model = L.Classifier(ModernMLP(625, 10))

    # Set up GPU usage if necessary. args.gpu is a condition as well as an
    # identification when passed to get_device().
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu()

    # Define optimizer (SGD, Adam, RMSprop, etc)
    # http://docs.chainer.org/en/latest/reference/optimizers.html
    # RMSprop default parameter setting:
    # lr=0.01, alpha=0.99, eps=1e-8
    optimizer = chainer.optimizers.RMSprop()
    optimizer.setup(model)

    # Set up trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'))

    # Evaluate the model at end of each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Helper functions (extensions) to monitor progress on stdout.
    report_params = [
        'epoch',
        'main/loss',
        'validation/main/loss',
        'main/accuracy',
        'validation/main/accuracy',
        'elapsed_time'
    ]
    # LogReport is registered exactly once (it was duplicated before).
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(report_params))
    trainer.extend(extensions.ProgressBar())

    # Take a snapshot for each specified epoch
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))

    # Save two plot images to the result dir
    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(
                ['main/loss', 'validation/main/loss'],
                'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))

    if args.resume:
        # Resume from a snapshot (NumPy NPZ format and HDF5 format available)
        # http://docs.chainer.org/en/latest/reference/serializers.html
        chainer.serializers.load_npz(args.resume, trainer)

    # Run trainer
    trainer.run()
def main():
    """Train an image-captioning model (CNN encoder + LSTM/NStepLSTM
    language model) on MSCOCO with Chainer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', type=str, default='result',
                        help='Output directory')
    parser.add_argument('--resume', '-r', type=str,
                        help='Resume the training from snapshot')
    parser.add_argument('--mscoco-root', type=str, default='data',
                        help='MSOCO dataset root directory')
    parser.add_argument('--max-iters', type=int, default=50000,
                        help='Maximum number of iterations to train')
    parser.add_argument('--batch-size', type=int, default=128,
                        help='Minibatch size')
    parser.add_argument('--dropout-ratio', type=float, default=0.5,
                        help='Language model dropout ratio')
    parser.add_argument('--val-keep-quantity', type=int, default=100,
                        help='Keep every N-th validation image')
    parser.add_argument('--val-iter', type=int, default=100,
                        help='Run validation every N-th iteration')
    parser.add_argument('--log-iter', type=int, default=1,
                        help='Log every N-th iteration')
    parser.add_argument('--snapshot-iter', type=int, default=1000,
                        help='Model snapshot every N-th iteration')
    parser.add_argument('--rnn', type=str, default='nsteplstm',
                        choices=['nsteplstm', 'lstm'],
                        help='Language model layer type')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--max-caption-length', type=int, default=30,
                        help='Maxium caption length when using LSTM layer')
    args = parser.parse_args()

    # Load the MSCOCO dataset. Assumes that the dataset has been downloaded
    # already using e.g. the `download.py` script
    train, val = datasets.get_mscoco(args.mscoco_root)

    # Validation samples are used to address overfitting and see how well your
    # model generalizes to yet unseen data. However, since the number of these
    # samples in MSCOCO is quite large (~200k) and thus require time to
    # evaluate, you may choose to use only a fraction of the available samples
    val = val[::args.val_keep_quantity]

    # Number of unique words that are found in the dataset
    vocab_size = len(train.vocab)

    # Instantiate the model to be trained either with LSTM layers or with
    # NStepLSTM layers
    model = ImageCaptionModel(
        vocab_size, dropout_ratio=args.dropout_ratio, rnn=args.rnn)

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    def transform(in_data):
        # Called for each sample and applies necessary preprocessing to the
        # image such as resizing and normalizing
        img, caption = in_data
        img = model.prepare(img)
        return img, caption

    # We need to preprocess the images since their sizes may vary (and the
    # model requires that they have the exact same fixed size)
    train = TransformDataset(train, transform)
    val = TransformDataset(val, transform)

    train_iter = iterators.MultiprocessIterator(
        train, args.batch_size, shared_mem=700000)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.batch_size, repeat=False, shuffle=False, shared_mem=700000)

    optimizer = optimizers.Adam()
    optimizer.setup(model)

    def converter(batch, device):
        # The converter receives a batch of input samples and may modify it if
        # necessary. In our case, we need to align the captions depending on if
        # we are using LSTM layers or NStepLSTM layers in the model.
        if args.rnn == 'lstm':
            max_caption_length = args.max_caption_length
        elif args.rnn == 'nsteplstm':
            max_caption_length = None  # variable-length captions are fine
        else:
            raise ValueError('Invalid RNN type.')
        return datasets.converter(
            batch, device, max_caption_length=max_caption_length)

    updater = training.updater.StandardUpdater(
        train_iter, optimizer=optimizer, device=args.gpu, converter=converter)
    trainer = training.Trainer(
        updater, out=args.out, stop_trigger=(args.max_iters, 'iteration'))
    trainer.extend(
        extensions.Evaluator(
            val_iter,
            target=model,
            converter=converter,
            device=args.gpu
        ),
        trigger=(args.val_iter, 'iteration')
    )
    trainer.extend(
        extensions.LogReport(
            ['main/loss', 'validation/main/loss'],
            trigger=(args.log_iter, 'iteration')
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ['main/loss', 'validation/main/loss'],
            trigger=(args.log_iter, 'iteration')
        )
    )
    trainer.extend(
        extensions.PrintReport(
            ['elapsed_time', 'epoch', 'iteration', 'main/loss',
             'validation/main/loss']
        ),
        trigger=(args.log_iter, 'iteration')
    )

    # Save model snapshots so that later on, we can load them and generate new
    # captions for any image. This can be done in the `predict.py` script
    trainer.extend(
        extensions.snapshot(filename='snapshot_{.updater.iteration}'),
        trigger=(args.snapshot_iter, 'iteration')
    )
    trainer.extend(
        extensions.snapshot_object(model, 'model_{.updater.iteration}'),
        trigger=(args.snapshot_iter, 'iteration')
    )
    trainer.extend(extensions.ProgressBar())

    if args.resume is not None:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
def main():
    '''Main entry point: parse arguments, build the CAEFINet model for
    video frame interpolation, train it, and save the final model and
    its parameter description.
    '''
    # argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate', '-l', type=float, default=0.001,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch', '-e', type=int, default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU1 ID (negative value indicates CPU)')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--iter_parallel', '-p', action='store_true',
                        default=False, help='loading dataset from disk')
    parser.add_argument('--test', action='store_true', default=False,
                        help='Test Mode, a few dataset')
    parser.add_argument('--opt', '-o', type=str,
                        choices=('adam', 'sgd'), default='adam')
    parser.add_argument('--fsize', '-f', type=int, default=5)
    parser.add_argument('--ch', '-c', type=int, default=4)
    parser.add_argument('--decay', '-d', type=str, default='exp',
                        choices=('exp', 'lin'))
    parser.add_argument('--weight', '-w', type=float, default=1.0)
    args = parser.parse_args()

    # print learning parameters
    print("-=Learning Parameter=-")
    print("# Max Epochs: {}".format(args.epoch))
    print("# Batch Size: {}".format(args.batchsize))
    print("# Learning Rate: {}".format(args.learnrate))
    print("# Optimizer Method: {}".format(args.opt))
    print("# Filter Size: {}".format(args.fsize))
    print("# Channel Scale: {}".format(args.ch))
    print("# coef. decay : {}".format(args.decay))
    print("# contloss' weight : {}".format(args.weight))
    print('# Train Dataet: General 100')
    if args.iter_parallel:
        print("# Data Iters that loads in Parallel")
    print("\n")

    # save directory (name encodes the hyper-parameters of this run)
    model_dir_name = 'CAEFINet_opt_{}_ch_{}_fsize_{}_decay_{}_weight_{}'.format(
        args.opt, args.ch, args.fsize, args.decay, args.weight)
    outdir = path.join(ROOT_PATH, 'results', 'FI', 'CAEFINet', model_dir_name)
    if not path.exists(outdir):
        os.makedirs(outdir)
    # record all arguments for reproducibility
    with open(path.join(outdir, 'arg_param.txt'), 'w') as f:
        for k, v in args.__dict__.items():
            f.write('{}:{}\n'.format(k, v))

    # loading dataset (--test selects a tiny dataset for a quick dry run)
    if args.test:
        print('# loading test dataet(UCF101_minimam_test_size64_frame3_group2_max4_p) ...')
        train_dataset = 'UCF101_minimam_test_size64_frame3_group2_max4_p'
        test_dataset = 'UCF101_minimam_test_size64_frame3_group2_max4_p'
    else:
        print('# loading test dataet(UCF101_train_size64_frame3_group10_max100_p, UCF101_test_size64_frame3_group25_max5_p) ...')
        train_dataset = 'UCF101_train_size64_frame3_group10_max100_p'
        test_dataset = 'UCF101_test_size64_frame3_group25_max5_p'

    # on-disk dataset for parallel iteration, in-memory otherwise
    if args.iter_parallel:
        train = ds.SequenceDataset(dataset=train_dataset)
        test = ds.SequenceDataset(dataset=test_dataset)
    else:
        train = ds.SequenceDatasetOnMem(dataset=train_dataset)
        test = ds.SequenceDatasetOnMem(dataset=test_dataset)

    # prepare model (uses a pretrained VGG16 for the content loss)
    model = N.CAEFINet(
        vgg_path=path.join(ROOT_PATH, 'models', 'VGG16.npz'),
        f_size=args.fsize, n_ch=args.ch, size=64)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # setup optimizer (choices are restricted to adam/sgd by argparse)
    if args.opt == 'adam':
        optimizer = chainer.optimizers.Adam(alpha=args.learnrate)
    elif args.opt == 'sgd':
        optimizer = chainer.optimizers.MomentumSGD(
            lr=args.learnrate, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

    # setup iterators
    if args.iter_parallel:
        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batchsize, n_processes=8)
        test_iter = chainer.iterators.MultiprocessIterator(
            test, args.batchsize, repeat=False, shuffle=False, n_processes=8)
    else:
        train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
        test_iter = chainer.iterators.SerialIterator(
            test, args.batchsize, repeat=False, shuffle=False)

    # setup trainer
    updater = training.StandardUpdater(
        train_iter, optimizer, device=args.gpu,
        loss_func=model.get_loss_func(weight=args.weight,
                                      coef_decay=args.decay))
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=outdir)

    # eval test data
    trainer.extend(extensions.Evaluator(
        test_iter, model, device=args.gpu,
        eval_func=model.get_loss_func(weight=args.weight,
                                      coef_decay=args.decay)))
    # dump loss graph
    trainer.extend(extensions.dump_graph('main/loss'))
    # lr shift: decay by 10x every 50 epochs ('alpha' is Adam's lr name)
    if args.opt == 'sgd':
        trainer.extend(extensions.ExponentialShift("lr", 0.1),
                       trigger=(50, 'epoch'))
    elif args.opt == 'adam':
        trainer.extend(extensions.ExponentialShift("alpha", 0.1),
                       trigger=(50, 'epoch'))
    # save snapshot every 10 epochs
    trainer.extend(extensions.snapshot(), trigger=(10, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        model, 'model_snapshot_{.updater.epoch}'), trigger=(10, 'epoch'))
    # log report
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.observe_lr(), trigger=(1, 'epoch'))
    # plot loss graphs
    trainer.extend(
        extensions.PlotReport(['main/loss', 'validation/main/loss'],
                              'epoch', file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/mse_loss', 'validation/main/mse_loss'],
                              'epoch', file_name='mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/cont_loss', 'validation/main/cont_loss'],
                              'epoch', file_name='cont_loss.png'))
    # plot accuracy (PSNR) graph
    trainer.extend(extensions.PlotReport(['main/psnr', 'validation/main/psnr'],
                                         'epoch', file_name='PSNR.png'))
    # print info
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss', 'main/mse_loss',
         'validation/main/mse_loss', 'main/cont_loss',
         'validation/main/cont_loss', 'main/psnr', 'validation/main/psnr',
         'lr', 'elapsed_time']))
    # print progress bar
    trainer.extend(extensions.ProgressBar())

    # [ChainerUI] enable to send commands from ChainerUI
    trainer.extend(CommandsExtension())
    # [ChainerUI] save 'args' to show experimental conditions
    save_args(args, outdir)

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    # save final model and a JSON description of its architecture parameters
    model_outdir = path.join(ROOT_PATH, 'models', model_dir_name)
    if not path.exists(model_outdir):
        os.makedirs(model_outdir)
    model_name = 'CAEFINet_{}_ch_{}_fsize_{}_decay_{}_weight_{}.npz'.format(
        args.opt, args.ch, args.fsize, args.decay, args.weight)
    chainer.serializers.save_npz(path.join(model_outdir, model_name), model)

    model_parameter = {
        'name': 'CAEFINetConcat',
        'parameter': {'f_size': args.fsize, 'ch': args.ch}
    }
    with open(path.join(model_outdir, 'model_parameter.json'), 'w') as f:
        json.dump(model_parameter, f)
def main():
    """ChainerCV training example: Faster R-CNN (VGG16 backbone) on
    PASCAL VOC 2007 (optionally VOC 2007+2012).
    """
    parser = argparse.ArgumentParser(
        description='ChainerCV training example: Faster R-CNN')
    parser.add_argument('--dataset', choices=('voc07', 'voc0712'),
                        help='The dataset to use: VOC07, VOC07+12',
                        default='voc07')
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--lr', '-l', type=float, default=1e-3)
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--step_size', '-ss', type=int, default=50000)
    parser.add_argument('--iteration', '-i', type=int, default=70000)
    args = parser.parse_args()

    np.random.seed(args.seed)

    # Training set; argparse `choices` guarantees one branch is taken.
    if args.dataset == 'voc07':
        train_data = VOCBboxDataset(split='trainval', year='2007')
    elif args.dataset == 'voc0712':
        train_data = ConcatenatedDataset(
            VOCBboxDataset(year='2007', split='trainval'),
            VOCBboxDataset(year='2012', split='trainval'))
    test_data = VOCBboxDataset(split='test', year='2007',
                               use_difficult=True, return_difficult=True)

    # Pretrained ImageNet weights initialize the backbone.
    faster_rcnn = FasterRCNNVGG16(n_fg_class=len(voc_bbox_label_names),
                                  pretrained_model='imagenet')
    faster_rcnn.use_preset('evaluate')
    model = FasterRCNNTrainChain(faster_rcnn)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0005))

    train_data = TransformDataset(train_data, Transform(faster_rcnn))

    # Faster R-CNN trains with batch_size=1 (variable image sizes).
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, batch_size=1, n_processes=None, shared_mem=100000000)
    test_iter = chainer.iterators.SerialIterator(
        test_data, batch_size=1, repeat=False, shuffle=False)
    updater = chainer.training.updater.StandardUpdater(
        train_iter, optimizer, device=args.gpu)

    trainer = training.Trainer(
        updater, (args.iteration, 'iteration'), out=args.out)

    # Save the detector (not the train chain) once at the end of training.
    trainer.extend(
        extensions.snapshot_object(model.faster_rcnn, 'snapshot_model.npz'),
        trigger=(args.iteration, 'iteration'))
    # Step-decay the learning rate by 10x at --step_size iterations.
    trainer.extend(extensions.ExponentialShift('lr', 0.1),
                   trigger=(args.step_size, 'iteration'))

    log_interval = 20, 'iteration'
    plot_interval = 3000, 'iteration'
    print_interval = 20, 'iteration'

    trainer.extend(chainer.training.extensions.observe_lr(),
                   trigger=log_interval)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.PrintReport(
        ['iteration', 'epoch', 'elapsed_time', 'lr',
         'main/loss',
         'main/roi_loc_loss',
         'main/roi_cls_loss',
         'main/rpn_loc_loss',
         'main/rpn_cls_loss',
         'validation/main/map',
         ]), trigger=print_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss'], file_name='loss.png',
                                  trigger=plot_interval),
            trigger=plot_interval)

    # Evaluate mAP right after the lr drop and at the final iteration.
    trainer.extend(
        DetectionVOCEvaluator(
            test_iter, model.faster_rcnn, use_07_metric=True,
            label_names=voc_bbox_label_names),
        trigger=ManualScheduleTrigger(
            [args.step_size, args.iteration], 'iteration'))

    trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
def get_trainer(args):
    """Build a ChainerMN distributed trainer from a YAML config.

    :param args: parsed CLI namespace providing ``config`` (YAML path),
        ``gpu`` (bool-ish flag), ``communicator`` (ChainerMN communicator
        name), ``result_dir`` (optional override) and ``resume``
        (optional snapshot path).
    :return: a fully-configured ``chainer.training.Trainer``.

    Fix over the original: the config file was opened via
    ``yaml.load(open(...))``, which never closes the file handle; it is
    now opened in a ``with`` block.
    """
    # SECURITY NOTE(review): yaml.load executes arbitrary Python tags.
    # Kept for compatibility with existing configs, but yaml.safe_load
    # should be used if the config file is ever untrusted.
    with open(args.config) as f:
        config = yaml.load(f)

    # Set workspace size (cuDNN scratch memory) if the config requests it.
    if 'max_workspace_size' in config:
        chainer.cuda.set_max_workspace_size(config['max_workspace_size'])

    # Prepare ChainerMN communicator; each process claims its intra-node rank
    # as its GPU device. CPU-only execution requires the 'naive' communicator.
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    # Show the setup information (rank 0 only, to avoid duplicated output).
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        if args.gpu:
            print('Using GPUs - max workspace size:',
                  chainer.cuda.get_max_workspace_size())
        print('Using {} communicator'.format(args.communicator))

    # Output version info
    if comm.rank == 0:
        print('Chainer version: {}'.format(chainer.__version__))
        print('ChainerMN version: {}'.format(chainermn.__version__))
        print('cuda: {}, cudnn: {}'.format(chainer.cuda.available,
                                           chainer.cuda.cudnn_enabled))

    # Create result_dir. When an explicit result_dir is given, the model
    # module is loaded from that directory (resume-style run); otherwise a
    # fresh result dir is derived from the config path.
    if args.result_dir is not None:
        config['result_dir'] = args.result_dir
        model_fn = config['model']['module'].split('.')[-1]
        sys.path.insert(0, args.result_dir)
        config['model']['module'] = model_fn
    else:
        config['result_dir'] = create_result_dir_from_config_path(args.config)
    log_fn = save_config_get_log_fn(config['result_dir'], args.config)
    if comm.rank == 0:
        print('result_dir:', config['result_dir'])

    # Instantiate model
    model = get_model_from_config(config, comm)
    if args.gpu:
        chainer.cuda.get_device(device).use()
        model.to_gpu()
    if comm.rank == 0:
        print('model:', model.__class__.__name__)

    # Initialize optimizer and wrap it for multi-node gradient exchange.
    optimizer = get_optimizer_from_config(model, config)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    if comm.rank == 0:
        print('optimizer:', optimizer.__class__.__name__)

    # Setting up datasets: only rank 0 loads them; scatter_dataset then
    # distributes shards to every rank (other ranks start with []).
    if comm.rank == 0:
        train_dataset, valid_dataset = get_dataset_from_config(config)
        print('train_dataset: {}'.format(len(train_dataset)),
              train_dataset.__class__.__name__)
        print('valid_dataset: {}'.format(len(valid_dataset)),
              valid_dataset.__class__.__name__)
    else:
        train_dataset, valid_dataset = [], []
    train_dataset = chainermn.scatter_dataset(train_dataset, comm)
    valid_dataset = chainermn.scatter_dataset(valid_dataset, comm)

    # Create iterators
    # multiprocessing.set_start_method('forkserver')
    train_iter, valid_iter = create_iterators(train_dataset, valid_dataset,
                                              config)
    if comm.rank == 0:
        print('train_iter:', train_iter.__class__.__name__)
        print('valid_iter:', valid_iter.__class__.__name__)

    # Create updater and trainer; a custom updater creator from the config
    # takes precedence over the default one.
    if 'updater_creator' in config:
        updater_creator = get_updater_creator_from_config(config)
        updater = updater_creator(train_iter, optimizer, device=device)
    else:
        updater = create_updater(train_iter, optimizer, device=device)
    if comm.rank == 0:
        print('updater:', updater.__class__.__name__)

    # Create Trainer
    trainer = training.Trainer(updater, config['stop_trigger'],
                               out=config['result_dir'])
    if comm.rank == 0:
        print('Trainer stops:', config['stop_trigger'])

    # Trainer extensions, driven by the config. Reporting extensions are
    # installed on rank 0 only; the Evaluator runs on every rank (it is
    # wrapped as a multi-node evaluator).
    for ext in config['trainer_extension']:
        ext, values = ext.popitem()
        if ext == 'LogReport' and comm.rank == 0:
            trigger = values['trigger']
            trainer.extend(extensions.LogReport(trigger=trigger,
                                                log_name=log_fn))
        elif ext == 'observe_lr' and comm.rank == 0:
            trainer.extend(extensions.observe_lr(), trigger=values['trigger'])
        elif ext == 'dump_graph' and comm.rank == 0:
            trainer.extend(extensions.dump_graph(**values))
        elif ext == 'Evaluator':
            assert 'module' in values
            mod = import_module(values['module'])
            evaluator = getattr(mod, values['name'])
            # The stock Evaluator evaluates the full (loss-reporting) model;
            # custom evaluators receive the bare predictor.
            if evaluator is extensions.Evaluator:
                evaluator = evaluator(valid_iter, model, device=device)
            else:
                evaluator = evaluator(valid_iter, model.predictor)
            evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
            trainer.extend(evaluator, trigger=values['trigger'],
                           name=values['prefix'])
        elif ext == 'PlotReport' and comm.rank == 0:
            trainer.extend(extensions.PlotReport(**values))
        elif ext == 'PrintReport' and comm.rank == 0:
            trigger = values.pop('trigger')
            trainer.extend(extensions.PrintReport(**values), trigger=trigger)
        elif ext == 'ProgressBar' and comm.rank == 0:
            upd_int = values['update_interval']
            trigger = values['trigger']
            trainer.extend(extensions.ProgressBar(update_interval=upd_int),
                           trigger=trigger)
        elif ext == 'snapshot' and comm.rank == 0:
            filename = values['filename']
            trigger = values['trigger']
            trainer.extend(extensions.snapshot(filename=filename),
                           trigger=trigger)

    # LR decay: stepwise drop at manually scheduled points…
    if 'lr_drop_ratio' in config['optimizer'] \
            and 'lr_drop_triggers' in config['optimizer']:
        ratio = config['optimizer']['lr_drop_ratio']
        points = config['optimizer']['lr_drop_triggers']['points']
        unit = config['optimizer']['lr_drop_triggers']['unit']
        drop_trigger = triggers.ManualScheduleTrigger(points, unit)

        def lr_drop(trainer):
            trainer.updater.get_optimizer('main').lr *= ratio

        trainer.extend(lr_drop, trigger=drop_trigger)

    # …and/or polynomial decay applied every iteration.
    if 'lr_drop_poly_power' in config['optimizer']:
        power = config['optimizer']['lr_drop_poly_power']
        stop_trigger = config['stop_trigger']
        batchsize = train_iter.batch_size
        len_dataset = len(train_dataset)
        trainer.extend(PolynomialShift('lr', power, stop_trigger,
                                       batchsize, len_dataset),
                       trigger=(1, 'iteration'))

    # Resume from a snapshot, if requested.
    if args.resume is not None:
        # fn = '{}.bak'.format(args.resume)
        # shutil.copy(args.resume, fn)
        serializers.load_npz(args.resume, trainer)
        if comm.rank == 0:
            print('Resumed from:', args.resume)

    if comm.rank == 0:
        print('==========================================')
    return trainer
# Extensions snapshot_interval = (args.snapshot_interval, 'iteration') display_interval = (args.display_interval, 'iteration') trainer.extend(extensions.Evaluator(valid_iter, model, device=args.gpu)) trainer.extend(extensions.dump_graph('main/loss')) trainer.extend(extensions.snapshot(), trigger=snapshot_interval) trainer.extend(extensions.LogReport(trigger=display_interval)) trainer.extend(extensions.PrintReport([ 'epoch', 'iteration', 'main/loss', 'main/accuracy', 'validation/main/loss', 'validation/main/accuracy' ]), trigger=display_interval) trainer.extend( extensions.PlotReport(['main/loss', 'validation/main/loss'], 'iteration', file_name='loss.png', trigger=display_interval)) trainer.extend( extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'], 'iteration', file_name='accuracy.png', trigger=display_interval)) trainer.extend(extensions.ProgressBar(update_interval=10)) # Resume if args.resume: chainer.serializers.load_npz(args.resume, trainer) # Run trainer.run()
def main():
    """Train ColumnNet on image lists given by train_list.txt / val_list.txt.

    Fixes over the original:
    * ``--out`` was parsed but ignored — the trainer and the final model
      save hard-coded ``'result'``; both now honor ``args.out`` (default
      unchanged, so default behavior is identical).
    * ``--resume`` was parsed but never used; a snapshot given via
      ``--resume`` is now actually loaded before training.
    * the two list files are opened with ``with`` so the handles are
      closed even on error.
    """
    parser = argparse.ArgumentParser(description='ColumnNet')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=200,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train. Keep a reference to the bare net so
    # it (not the Classifier wrapper) can be saved at the end.
    Model = ColumnNet()
    model = L.Classifier(Model)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the ColumnNet dataset from the two image-list files.
    with open('train_list.txt') as f:
        train_lines = f.readlines()
    with open('val_list.txt') as f:
        val_lines = f.readlines()
    #dataset = LabeledImageDataset(list(zip(fnames, labels)))
    #transform_dataset = TransformDataset(dataset, transform)
    #train, val = datasets.split_dataset_random(transform_dataset, int(len(dataset) * 0.8), seed=0)
    train = load_dataset(train_lines)
    val = load_dataset(val_lines)

    train_iter = iterators.MultiprocessIterator(train, args.batchsize)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, shuffle=False)

    # In --test mode validate/log per epoch; otherwise use long
    # iteration-based intervals suited to full training runs.
    if args.test:
        val_interval = 5, 'epoch'
        log_interval = 1, 'epoch'
    else:
        val_interval = 100000, 'iteration'
        log_interval = 1000, 'iteration'

    # Set up the updater and trainer (output directory now honors --out).
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(val_iter, model, device=args.gpu),
                   trigger=val_interval)
    trainer.extend(extensions.snapshot(), trigger=(1, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'), trigger=val_interval)
    # Be careful to pass the interval directly to LogReport
    # (it determines when to emit log rather than when to read observations)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'validation/main/map',
        'lr'
    ]), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.PlotReport(
        ['main/loss', 'validation/main/map'],
        x_key='epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(
        ['main/accuracy', 'validation/main/map'],
        x_key='epoch', file_name='accuracy.png'))
    trainer.extend(extensions.dump_graph('main/loss'))

    # Resume from a snapshot if one was given (previously --resume was
    # accepted but silently ignored).
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
    # Save the bare network weights next to the trainer output.
    chainer.serializers.save_npz(
        '{}/columnnet.model'.format(args.out), Model)
def main():
    """Train a (Shift-)ResNet classifier on CIFAR-10/100.

    Fixes over the original:
    * the model selection used two independent ``if`` statements with no
      fallback, so an unknown ``--model`` left ``model`` unbound and
      crashed later with a confusing ``NameError``; it now raises a clear
      ``RuntimeError``, matching the dataset-choice error handling.
    * the accuracy plot was written to ``loss.png``; it now goes to
      ``accuracy.png``.
    """
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('--dataset', '-d', default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize', '-b', type=int, default=256,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate', '-l', type=float, default=0.05,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch', '-e', type=int, default=300,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--model', '-m', default='resnet',
                        help='using model name')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if args.dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
    elif args.dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')
    train = AugmentedDataset(train)

    # Model selection: the boolean flag toggles the shift variant of the
    # ResNet. An unknown name now fails fast instead of NameError-ing later.
    if args.model == 'resnet':
        model = models.resnet_shift.ResNet(False, class_labels)
    elif args.model == 'shift':
        model = models.resnet_shift.ResNet(True, class_labels)
    else:
        raise RuntimeError('Invalid model choice.')

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # snapshot model weights every 10 epochs
    trainer.extend(extensions.snapshot_object(
        model, filename='model_epoch-{.updater.epoch}'),
        trigger=(10, 'epoch'))

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5),
                   trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Take a full trainer snapshot only at the final epoch.
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    # Accuracy curve (was mislabeled loss.png in the original).
    trainer.extend(
        extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'],
                              x_key='epoch', file_name='accuracy.png'))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
def train(mode):
    """Train the multi-modal GEINet on four OU-ISIR GEI datasets.

    :param mode: truthy → start training from scratch; falsy → load the
        previously saved trainer snapshot and continue training.

    Fix over the original: the per-1250-epoch snapshot filename template
    was ``'model_shapshot_{.update.epoch}'`` — ``trainer.update`` does not
    exist (the attribute is ``updater``), so the extension raised
    ``AttributeError`` the first time its trigger fired. The template now
    uses ``{.updater.epoch}`` (the 'shapshot' spelling is kept so existing
    tooling that looks for those files keeps working).
    """
    # Four data sources (one per modality/fold); mode=True → training split.
    Dt1_train_dir = "/media/wutong/New Volume/reseach/Dataset/OU-ISIR_by_Setoguchi/Gallery/signed/128_3ch/CV01_(Gallery&Probe)_2nd"
    train1 = load_GEI(path_dir=Dt1_train_dir, mode=True)
    Dt2_train_dir = "/media/wutong/New Volume/reseach/Dataset/OU-ISIR_by_Setoguchi/Gallery/signed/128_3ch/CV01_Dt2_(Gallery&Probe)"
    train2 = load_GEI(path_dir=Dt2_train_dir, mode=True)
    Dt3_train_dir = "/media/wutong/New Volume/reseach/Dataset/OU-ISIR_by_Setoguchi/Gallery/signed/128_3ch/CV01_Dt3_(Gallery&Probe)"
    train3 = load_GEI(path_dir=Dt3_train_dir, mode=True)
    Dt4_train_dir = "/media/wutong/New Volume/reseach/Dataset/OU-ISIR_by_Setoguchi/Gallery/signed/128_3ch/CV01_Dt4_(Gallery&Probe)"
    train4 = load_GEI(path_dir=Dt4_train_dir, mode=True)

    model = Multi_modal_GEINet()
    model.to_gpu()

    # shuffle=False keeps the four iterators aligned sample-for-sample so
    # the custom updater sees corresponding items from each modality.
    # train_iter = iterators.MultiprocessIterator(train, batch_size=239)
    Dt1_train_iter = iterators.SerialIterator(train1, batch_size=239,
                                              shuffle=False)
    Dt2_train_iter = iterators.SerialIterator(train2, batch_size=239,
                                              shuffle=False)
    Dt3_train_iter = iterators.SerialIterator(train3, batch_size=239,
                                              shuffle=False)
    Dt4_train_iter = iterators.SerialIterator(train4, batch_size=239,
                                              shuffle=False)

    # optimizer = chainer.optimizers.SGD(lr=0.02)
    optimizer = chainer.optimizers.MomentumSGD(lr=0.02, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.01))

    # updater = training.ParallelUpdater(train_iter, optimizer, devices={'main': 0, 'second': 1})
    updater = Multi_modal_Updater(model, Dt1_train_iter, Dt2_train_iter,
                                  Dt3_train_iter, Dt4_train_iter,
                                  optimizer, device=0)

    epoch = 6250
    trainer = training.Trainer(
        updater, (epoch, 'epoch'),
        out='/home/wutong/Setoguchi/chainer_files/result')

    # trainer.extend(extensions.Evaluator(test_iter, model, device=0))
    trainer.extend(extensions.ExponentialShift(attr='lr', rate=0.56234),
                   trigger=(1250, 'epoch'))
    trainer.extend(
        extensions.LogReport(log_name='SFDEI_log', trigger=(20, "epoch")))
    # Fixed: '{.update.epoch}' → '{.updater.epoch}' (trainer has no
    # 'update' attribute; the old template crashed when the trigger fired).
    trainer.extend(extensions.snapshot_object(
        model, filename='model_shapshot_{.updater.epoch}'),
        trigger=(1250, 'epoch'))
    trainer.extend(extensions.snapshot(), trigger=(1250, 'epoch'))
    trainer.extend(extensions.PrintReport(['epoch', 'accuracy', 'loss']))
    # 'validation/main/accuracy']),
    # trigger=(1, "epoch"))
    trainer.extend(
        extensions.dump_graph(root_name="loss", out_name="multi_modal_3.dot"))
    trainer.extend(extensions.PlotReport(["loss"]), trigger=(50, 'epoch'))
    trainer.extend(extensions.ProgressBar())

    model_path = ("/home/wutong/Setoguchi/chainer_files/"
                  "SFDEINet_multi_modal/SFDEINet_multi_modal_model")
    if mode:
        # Run the trainer from scratch.
        trainer.run()
    else:
        # Continue from the previously saved trainer state.
        serializers.load_npz(model_path, trainer)
        trainer.run()
    # NOTE(review): both saves target the SAME path, so the model save
    # immediately overwrites the trainer save — confirm whether distinct
    # filenames were intended (paths kept unchanged for compatibility).
    serializers.save_npz(model_path, trainer)
    serializers.save_npz(model_path, model)
def main():
    """Train Darknet53 as a 20-class classifier on VOC crops.

    Fix over the original: ``--seed`` was declared without ``type=int``,
    so any user-supplied value arrived as ``str`` and
    ``np.random.seed``/``cupy.random.seed`` raised; it is now parsed as an
    int (default unchanged).
    """
    parser = argparse.ArgumentParser(description='Chainer Darknet53 Train')
    parser.add_argument('--batchsize', '-b', type=int, default=8)
    parser.add_argument('--iteration', '-i', type=int, default=100000)
    parser.add_argument('--gpus', '-g', type=int, nargs='*', default=[])
    parser.add_argument('--out', '-o', default='darknet53-voc-result')
    # type=int added: a CLI-supplied seed used to arrive as str and crash
    # the numpy/cupy RNG seeding below.
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--display_interval', type=int, default=100)
    parser.add_argument('--snapshot_interval', type=int, default=100)
    parser.add_argument('--validation_size', type=int, default=2048)
    args = parser.parse_args()

    print('GPUs: {}'.format(args.gpus))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# iteration: {}'.format(args.iteration))
    print('')

    random.seed(args.seed)
    np.random.seed(args.seed)

    darknet53 = Darknet53(20)
    model = L.Classifier(darknet53)

    # First listed GPU is the main device; seed cupy only when GPUs are used.
    device = -1
    if len(args.gpus) > 0:
        device = args.gpus[0]
        cuda.cupy.random.seed(args.seed)
        cuda.get_device_from_id(args.gpus[0]).use()
    if len(args.gpus) == 1:
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(0.0005),
                       'hook_decay')

    # Classification-mode YOLO datasets; validation is a fixed random subset
    # of at most --validation_size crops.
    train = VOCBboxDataset(split='train')
    test = VOCBboxDataset(split='val')
    train = YOLOVOCDataset(train, classifier=True, jitter=0.2,
                           hue=0.1, sat=.75, val=.75)
    test = YOLOVOCDataset(test, classifier=True, crop_size=(256, 256))
    test = test[np.random.permutation(np.arange(
        len(test)))[:min(args.validation_size, len(test))]]

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # Single-GPU/CPU → StandardUpdater; multiple GPUs → ParallelUpdater.
    if len(args.gpus) <= 1:
        updater = training.StandardUpdater(train_iter, optimizer,
                                           device=device)
    else:
        devices = {'main': args.gpus[0]}
        for gpu in args.gpus[1:]:
            devices['gpu{}'.format(gpu)] = gpu
        updater = training.ParallelUpdater(train_iter, optimizer,
                                           devices=devices)

    trainer = training.Trainer(updater, (args.iteration, 'iteration'),
                               out=args.out)

    display_interval = (args.display_interval, 'iteration')
    snapshot_interval = (args.snapshot_interval, 'iteration')

    trainer.extend(extensions.Evaluator(test_iter, model, device=device),
                   trigger=display_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.LogReport(trigger=display_interval))
    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'iteration', display_interval,
                                  file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'iteration', display_interval, file_name='accuracy.png'))
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
    ]), trigger=display_interval)
    trainer.extend(extensions.ProgressBar(update_interval=5))

    # Keep the best-validation weights separately from the rolling snapshot.
    trainer.extend(extensions.snapshot_object(darknet53,
                                              'darknet53_snapshot.npz'),
                   trigger=training.triggers.MinValueTrigger(
                       'validation/main/loss', snapshot_interval))
    trainer.extend(extensions.snapshot_object(darknet53,
                                              'darknet53_final.npz'),
                   trigger=snapshot_interval)
    # Darknet-style polynomial LR schedule and multi-scale crop updates.
    trainer.extend(DarknetShift(optimizer, 'poly', args.iteration))
    trainer.extend(CropSizeUpdater(train,
                                   [(4 + i) * 32 for i in range(0, 11)]))

    trainer.run()
def train(args):
    '''RUN TRAINING

    End-to-end Tacotron2 TTS training (ESPnet style): reads utterance
    JSONs, builds the model/optimizer, and drives a chainer Trainer whose
    updater steps the PyTorch model.

    :param args: parsed program arguments (seed, debugmode, JSON paths,
        outdir, ngpu, batch/optimizer settings, resume path, ...)
    '''
    # seed setting
    torch.manual_seed(args.seed)
    # use determinisitic computation or not
    if args.debugmode < 1:
        torch.backends.cudnn.deterministic = False
        logging.info('torch cudnn deterministic is disabled')
    else:
        torch.backends.cudnn.deterministic = True
    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')
    # get input and output dimension info from the first validation utterance
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    # reverse input and output dimension: for TTS the text ('output' in the
    # JSON) is the model input and the acoustic features are the target.
    idim = int(valid_json[utts[0]]['output'][0]['shape'][1])
    odim = int(valid_json[utts[0]]['input'][0]['shape'][1])
    if args.use_speaker_embedding:
        # second 'input' entry holds the speaker-embedding vector
        args.spk_embed_dim = int(valid_json[utts[0]]['input'][1]['shape'][0])
    else:
        args.spk_embed_dim = None
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))
    # write model config (idim/odim/args triple, JSON-encoded)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to' + model_conf)
        f.write(
            json.dumps((idim, odim, vars(args)),
                       indent=4, sort_keys=True).encode('utf_8'))
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))
    # specify model architecture
    tacotron2 = Tacotron2(idim, odim, args)
    logging.info(tacotron2)
    # check the use of multi-gpu; DataParallel splits each batch over GPUs,
    # so the batch size is scaled to keep the per-GPU size constant.
    if args.ngpu > 1:
        tacotron2 = torch.nn.DataParallel(tacotron2,
                                          device_ids=list(range(args.ngpu)))
        logging.info('batch size is automatically increased (%d -> %d)' % (
            args.batch_size, args.batch_size * args.ngpu))
        args.batch_size *= args.ngpu
    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)
    # define loss wrapper (adds reporting around the bare model)
    model = Tacotron2Loss(tacotron2, args.use_masking,
                          args.bce_pos_weight, args.monotonic)
    reporter = model.reporter
    # Setup an optimizer
    optimizer = torch.optim.Adam(model.parameters(), args.lr, eps=args.eps,
                                 weight_decay=args.weight_decay)
    # FIXME: TOO DIRTY HACK
    # chainer's trainer expects optimizer.target/serialize; graft them onto
    # the torch optimizer so the chainer machinery can drive it.
    setattr(optimizer, 'target', reporter)
    setattr(optimizer, 'serialize', lambda s: reporter.serialize(s))
    # Setup a converter
    converter = CustomConverter(True, args.use_speaker_embedding)
    # read json data
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    # make minibatch list (variable length)
    train_batchset = make_batchset(train_json, args.batch_size,
                                   args.maxlen_in, args.maxlen_out,
                                   args.minibatches, args.batch_sort_key)
    valid_batchset = make_batchset(valid_json, args.batch_size,
                                   args.maxlen_in, args.maxlen_out,
                                   args.minibatches, args.batch_sort_key)
    # hack to make batchsze argument as 1
    # actual bathsize is included in a list
    train_iter = chainer.iterators.MultiprocessIterator(TransformDataset(
        train_batchset, converter.transform),
        batch_size=1, n_processes=2, n_prefetch=8)  # , maxtasksperchild=20)
    valid_iter = chainer.iterators.MultiprocessIterator(TransformDataset(
        valid_batchset, converter.transform),
        batch_size=1, repeat=False, shuffle=False,
        n_processes=2, n_prefetch=8)  # maxtasksperchild=20)
    # Set up a trainer
    updater = CustomUpdater(model, args.grad_clip, train_iter, optimizer,
                            converter, device)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)
    # Resume from a snapshot
    if args.resume:
        logging.info('resumed from %s' % args.resume)
        torch_resume(args.resume, trainer)
    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        CustomEvaluator(model, valid_iter, reporter, converter, device))
    # Save attention figure for each epoch (longest N validation utterances)
    if args.num_save_attention > 0:
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        # DataParallel wraps the model in .module; unwrap for visualization.
        if hasattr(tacotron2, "module"):
            att_vis_fn = tacotron2.module.calculate_all_attentions
        else:
            att_vis_fn = tacotron2.calculate_all_attentions
        trainer.extend(PlotAttentionReport(att_vis_fn,
                                           data, args.outdir + '/att_ws',
                                           converter=CustomConverter(
                                               False,
                                               args.use_speaker_embedding),
                                           device=device, reverse=True),
                       trigger=(1, 'epoch'))
    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport([
            'main/loss', 'validation/main/loss', 'main/l1_loss',
            'validation/main/l1_loss', 'main/mse_loss',
            'validation/main/mse_loss', 'main/bce_loss',
            'validation/main/bce_loss', 'main/monotonic_loss',
            'validation/main/monotonic_loss'
        ], 'epoch', file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/l1_loss', 'validation/main/l1_loss'],
                              'epoch', file_name='l1_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/mse_loss', 'validation/main/mse_loss'],
                              'epoch', file_name='mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/bce_loss', 'validation/main/bce_loss'],
                              'epoch', file_name='bce_loss.png'))
    trainer.extend(
        extensions.PlotReport(
            ['main/monotonic_loss', 'validation/main/monotonic_loss'],
            'epoch', file_name='monotonic_loss.png'))
    # Save snapshot for each epoch
    trainer.extend(torch_snapshot(), trigger=(1, 'epoch'))
    # Save best models (lowest validation loss so far)
    trainer.extend(
        extensions.snapshot_object(tacotron2, 'model.loss.best',
                                   savefun=torch_save),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))
    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(REPORT_INTERVAL, 'iteration')))
    report_keys = [
        'epoch', 'iteration', 'elapsed_time', 'main/loss', 'main/l1_loss',
        'main/mse_loss', 'main/bce_loss', 'main/monotonic_loss',
        'validation/main/loss', 'validation/main/l1_loss',
        'validation/main/mse_loss', 'validation/main/bce_loss',
        'validation/main/monotonic_loss'
    ]
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(REPORT_INTERVAL, 'iteration'))
    trainer.extend(extensions.ProgressBar(update_interval=REPORT_INTERVAL))
    # Run the training
    trainer.run()
def main():
    """Train an FCN8s-based depth-prediction model on a jsk_perception dataset.

    Builds the dataset/iterators/optimizer from CLI args, initializes the
    model from pretrained VGG16, and runs a chainer Trainer that logs,
    plots and snapshots into the output directory (default: a timestamped
    folder under the ROS home 'learning_logs').
    """
    rospack = rospkg.RosPack()
    jsk_perception_datasets_path = osp.join(
        rospack.get_path('jsk_perception'), 'learning_datasets')

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-g', '--gpu', default=0, type=int, help='GPU id')
    parser.add_argument('-d', '--dataset_dir',
                        default=osp.join(jsk_perception_datasets_path,
                                         'human_size_mirror_dataset'),
                        type=str, help='Path to root directory of dataset')
    parser.add_argument('-m', '--model',
                        default='FCN8sDepthPredictionConcatFirst',
                        type=str, help='Model class name')
    parser.add_argument('-b', '--batch_size', default=1, type=int,
                        help='Batch size')
    parser.add_argument('-e', '--epoch', default=100, type=int,
                        help='Training epoch')
    parser.add_argument('-o', '--out', type=str, default=None,
                        help='Output directory')
    args = parser.parse_args()

    gpu = args.gpu
    out = args.out

    # 0. config — default output dir is a timestamped folder under ROS home.
    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    if out is None:
        out = osp.join(rospkg.get_ros_home(), 'learning_logs', timestamp)
    max_iter_epoch = args.epoch, 'epoch'
    progress_bar_update_interval = 10  # iteration
    print_interval = 100, 'iteration'
    log_interval = 100, 'iteration'
    test_interval = 5, 'epoch'
    save_interval = 5, 'epoch'

    # 1. dataset — augmentation only on the training split.
    dataset_train = DepthPredictionDataset(args.dataset_dir, split='train',
                                           aug=True)
    dataset_valid = DepthPredictionDataset(args.dataset_dir, split='test',
                                           aug=False)
    dataset_train_transformed = TransformDataset(dataset_train, transform)
    dataset_valid_transformed = TransformDataset(dataset_valid, transform)
    iter_train = chainer.iterators.MultiprocessIterator(
        dataset_train_transformed, batch_size=args.batch_size,
        shared_mem=10 ** 8)
    iter_valid = chainer.iterators.MultiprocessIterator(
        dataset_valid_transformed, batch_size=1, shared_mem=10 ** 8,
        repeat=False, shuffle=False)

    # 2. model — encoder initialized from downloaded pretrained VGG16.
    vgg = fcn.models.VGG16()
    vgg_path = vgg.download()
    chainer.serializers.load_npz(vgg_path, vgg)
    n_class = len(dataset_train.class_names)
    assert n_class == 2
    if args.model == 'FCN8sDepthPredictionConcatFirst':
        model = FCN8sDepthPredictionConcatFirst(n_class=n_class, masking=True)
    else:
        print('Invalid model class.')
        exit(1)
    model.init_from_vgg16(vgg)
    if gpu >= 0:
        cuda.get_device_from_id(gpu).use()
        model.to_gpu()

    # 3. optimizer
    optimizer = chainer.optimizers.Adam(alpha=1.0e-5)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0005))

    updater = chainer.training.updater.StandardUpdater(iter_train, optimizer,
                                                       device=gpu)
    trainer = chainer.training.Trainer(updater, max_iter_epoch, out=out)
    # Adam's step size 'alpha' decays per iteration (default shift trigger).
    trainer.extend(extensions.ExponentialShift("alpha", 0.99997))

    # Record run metadata next to the logs for later identification.
    if not osp.isdir(out):
        os.makedirs(out)
    with open(osp.join(out, 'dataset.txt'), 'w') as f:
        f.write(dataset_train.__class__.__name__)
    with open(osp.join(out, 'model.txt'), 'w') as f:
        f.write(model.__class__.__name__)
    with open(osp.join(out, 'batch_size.txt'), 'w') as f:
        f.write(str(args.batch_size))

    # Keep the weights with the best validation depth accuracy (<0.10 m
    # threshold metric, as reported by the model).
    trainer.extend(extensions.snapshot_object(
        model, savefun=chainer.serializers.save_npz,
        filename='model_snapshot.npz'),
        trigger=chainer.training.triggers.MaxValueTrigger(
            'validation/main/depth_acc<0.10', save_interval))
    trainer.extend(
        extensions.dump_graph(root_name='main/loss',
                              out_name='network_architecture.dot'))
    trainer.extend(
        extensions.LogReport(log_name='log.json', trigger=log_interval))
    trainer.extend(extensions.PlotReport([
        'main/loss',
        'validation/main/loss',
    ], file_name='loss_plot.png', x_key='epoch',
        trigger=(5, 'epoch')), trigger=(5, 'epoch'))
    trainer.extend(chainer.training.extensions.PrintReport([
        'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
        'main/seg_loss', 'main/reg_loss', 'main/miou',
        'main/depth_acc<0.03', 'main/depth_acc<0.10', 'main/depth_acc<0.30',
        'validation/main/miou', 'validation/main/depth_acc<0.03',
        'validation/main/depth_acc<0.10',
        'validation/main/depth_acc<0.30',
    ]), trigger=print_interval)
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(
        extensions.ProgressBar(update_interval=progress_bar_update_interval))
    trainer.extend(extensions.Evaluator(iter_valid, model, device=gpu),
                   trigger=test_interval)

    trainer.run()
# Dump a computational graph from 'loss' variable at the first iteration # The "main" refers to the target link of the "main" optimizer. trainer.extend(extensions.dump_graph('main/loss')) # Take a snapshot for each specified epoch trainer.extend(extensions.snapshot(), trigger=(args.epochs, 'epoch')) # Write a log of evaluation statistics for each epoch trainer.extend(extensions.LogReport(log_name=None)) # Save two plot images to the result dir if extensions.PlotReport.available(): trainer.extend( extensions.PlotReport(['main/loss', 'validation/main/loss'], 'epoch', file_name='loss.png')) trainer.extend( extensions.PlotReport( ['main/accuracy', 'validation/main/accuracy'], 'epoch', file_name='accuracy.png')) # Print selected entries of the log to stdout # Here "main" refers to the target link of the "main" optimizer again, and # "validation" refers to the default name of the Evaluator extension. # Entries other than 'epoch' are reported by the Classifier link, called by # either the updater or the evaluator. trainer.extend( extensions.PrintReport([ 'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
def train(args):
    """Train a model described by a YAML experiment config.

    Args:
        args: Parsed command-line arguments. Reads ``args.config`` (path to
            the YAML config file), ``args.result_dir`` (output dir override,
            may be None), ``args.gpus`` (list of device ids; the first is the
            main device) and ``args.resume`` (snapshot path or None).

    Returns:
        int: 0 on success.
    """
    # Load the experiment config. A context manager closes the file handle,
    # and an explicit Loader avoids the PyYAML >= 5.1 deprecation warning for
    # bare yaml.load(); yaml.Loader matches the old default parse semantics.
    # NOTE(review): switch to yaml.safe_load if no config relies on
    # python-specific tags — confirm against the existing config files.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    print('==========================================')

    # Set workspace size
    if 'max_workspace_size' in config:
        chainer.cuda.set_max_workspace_size(config['max_workspace_size'])

    # Output version info
    print('chainer version: {}'.format(chainer.__version__))
    print('cuda: {}, cudnn: {}, nccl: {}'.format(
        chainer.cuda.available, chainer.cuda.cudnn_enabled, HAVE_NCCL))

    # Create result_dir (command-line value wins over a derived default)
    if args.result_dir is not None:
        config['result_dir'] = args.result_dir
    else:
        config['result_dir'] = create_result_dir_from_config_path(args.config)
    log_fn = save_config_get_log_fn(config['result_dir'], args.config)
    print('result_dir:', config['result_dir'])

    # Instantiate model
    model = get_model_from_config(config)
    print('model:', model.__class__.__name__)

    # Initialize optimizer
    optimizer = get_optimizer_from_config(model, config)
    print('optimizer:', optimizer.__class__.__name__)

    # Setting up datasets
    train_dataset, valid_dataset = get_dataset_from_config(config)
    print('train_dataset: {}'.format(len(train_dataset)),
          train_dataset.__class__.__name__)
    print('valid_dataset: {}'.format(len(valid_dataset)),
          valid_dataset.__class__.__name__)

    # Prepare devices: 'main' plus one named slot per extra GPU
    devices = {'main': args.gpus[0]}
    for gid in args.gpus[1:]:
        devices['gpu{}'.format(gid)] = gid

    # Create iterators
    train_iter, valid_iter = create_iterators(
        train_dataset, config['dataset']['train']['batchsize'],
        valid_dataset, config['dataset']['valid']['batchsize'],
        devices)
    print('train_iter:', train_iter.__class__.__name__)
    print('valid_iter:', valid_iter.__class__.__name__)

    # Create updater and trainer
    updater_creator = get_updater_creator_from_config(config)
    updater = updater_creator(train_iter, optimizer, devices)
    print('updater:', updater.__class__.__name__)

    trainer = training.Trainer(
        updater, config['stop_trigger'], out=config['result_dir'])
    print('Trainer stops:', config['stop_trigger'])

    # Trainer extensions: each list entry is a one-item mapping
    # {extension_name: kwargs}; dispatch on the name.
    for ext in config['trainer_extension']:
        ext, values = ext.popitem()
        if ext == 'LogReport':
            trigger = values['trigger']
            trainer.extend(
                extensions.LogReport(trigger=trigger, log_name=log_fn))
        elif ext == 'observe_lr':
            trainer.extend(extensions.observe_lr(), trigger=values['trigger'])
        elif ext == 'dump_graph':
            trainer.extend(extensions.dump_graph(**values))
        elif ext == 'Evaluator':
            evaluator_creator = get_evaluator_creator_from_config(values)
            evaluator = evaluator_creator(valid_iter, model, devices)
            trainer.extend(evaluator, trigger=values['trigger'],
                           name=values['prefix'])
        elif ext == 'PlotReport':
            trainer.extend(extensions.PlotReport(**values))
        elif ext == 'PrintReport':
            trigger = values.pop('trigger')
            trainer.extend(extensions.PrintReport(**values), trigger=trigger)
        elif ext == 'ProgressBar':
            upd_int = values['update_interval']
            trigger = values['trigger']
            trainer.extend(extensions.ProgressBar(update_interval=upd_int),
                           trigger=trigger)
        elif ext == 'snapshot':
            filename = values['filename']
            trigger = values['trigger']
            trainer.extend(extensions.snapshot(filename=filename),
                           trigger=trigger)
        elif ext == 'ParameterStatistics':
            # Resolve dotted link names (e.g. "conv1.bn") relative to
            # model.predictor before handing them to ParameterStatistics.
            links = []
            for link_name in values.pop('links'):
                lns = [ln.strip() for ln in link_name.split('.') if ln.strip()]
                target = model.predictor
                for ln in lns:
                    target = getattr(target, ln)
                links.append(target)
            trainer.extend(extensions.ParameterStatistics(links, **values))
        elif ext == 'custom':
            custom_extension = get_custum_extension_from_config(values)
            trainer.extend(custom_extension, trigger=values['trigger'])

    # LR decay: multiply the lr by a fixed ratio at scheduled points
    if 'lr_drop_ratio' in config['optimizer'] \
            and 'lr_drop_triggers' in config['optimizer']:
        ratio = config['optimizer']['lr_drop_ratio']
        points = config['optimizer']['lr_drop_triggers']['points']
        unit = config['optimizer']['lr_drop_triggers']['unit']
        drop_trigger = triggers.ManualScheduleTrigger(points, unit)

        def lr_drop(trainer):
            trainer.updater.get_optimizer('main').lr *= ratio

        trainer.extend(lr_drop, trigger=drop_trigger)

    # Resume: keep a .bak copy of the snapshot before loading it
    if args.resume is not None:
        fn = '{}.bak'.format(args.resume)
        shutil.copy(args.resume, fn)
        serializers.load_npz(args.resume, trainer)
        print('Resumed from:', args.resume)

    print('==========================================')
    trainer.run()
    return 0
def main():
    """Train SegNetBasic for CamVid semantic segmentation.

    Builds the dataset/iterator/optimizer/trainer pipeline, runs training,
    then saves a snapshot of the predictor with BatchNormalization
    statistics recalculated over the training data.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--batchsize', type=int, default=12)
    parser.add_argument('--class-weight', type=str, default='class_weight.npy')
    parser.add_argument('--out', type=str, default='result')
    args = parser.parse_args()

    # Schedules (all iteration-based).
    report_trigger = (50, 'iteration')
    eval_trigger = (2000, 'iteration')
    stop_trigger = (16000, 'iteration')

    # Datasets: augmented training split, untouched validation split.
    train_data = TransformDataset(CamVidDataset(split='train'), transform)
    val_data = CamVidDataset(split='val')

    train_iter = iterators.MultiprocessIterator(train_data, args.batchsize)
    val_iter = iterators.MultiprocessIterator(
        val_data, args.batchsize, shuffle=False, repeat=False)

    # SegNetBasic wrapped with a class-weighted pixelwise softmax loss.
    class_weight = np.load(args.class_weight)
    net = PixelwiseSoftmaxClassifier(
        SegNetBasic(n_class=len(camvid_label_names)),
        class_weight=class_weight)

    if args.gpu >= 0:
        # Make the requested GPU current and move the model onto it.
        chainer.cuda.get_device_from_id(args.gpu).use()
        net.to_gpu()

    # MomentumSGD with weight decay, per the SegNet training recipe.
    opt = optimizers.MomentumSGD(lr=0.1, momentum=0.9)
    opt.setup(net)
    opt.add_hook(chainer.optimizer_hooks.WeightDecay(rate=0.0005))

    updater = training.updaters.StandardUpdater(
        train_iter, opt, device=args.gpu)
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    trainer.extend(extensions.LogReport(trigger=report_trigger))
    trainer.extend(extensions.observe_lr(), trigger=report_trigger)
    trainer.extend(extensions.dump_graph('main/loss'))

    if extensions.PlotReport.available():
        trainer.extend(extensions.PlotReport(
            ['main/loss'], x_key='iteration', file_name='loss.png'))
        trainer.extend(extensions.PlotReport(
            ['validation/main/miou'], x_key='iteration',
            file_name='miou.png'))

    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'iteration', 'elapsed_time', 'lr', 'main/loss',
             'validation/main/miou', 'validation/main/mean_class_accuracy',
             'validation/main/pixel_accuracy']),
        trigger=report_trigger)
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(
        SemanticSegmentationEvaluator(
            val_iter, net.predictor, camvid_label_names),
        trigger=eval_trigger)

    trainer.run()

    chainer.serializers.save_npz(
        os.path.join(args.out, 'snapshot_model.npz'),
        recalculate_bn_statistics(net.predictor, 24))
def train(args):
    """Train an MT model with the given args.

    Builds the model, optimizer, data iterators and a chainer-style Trainer
    with evaluation, snapshotting, lr/eps decay and reporting extensions,
    then runs training to completion (or early stop).

    Args:
        args (namespace): The program arguments.

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())
    # NOTE(review): both dimensions are read from "output" — presumably for
    # MT the source side is stored at output[1] and the target at output[0];
    # confirm against the json produced by the data-prep scripts.
    idim = int(valid_json[utts[0]]["output"][1]["shape"][1])
    odim = int(valid_json[utts[0]]["output"][0]["shape"][1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # specify model architecture (dynamically imported from args.model_module)
    model_class = dynamic_import(args.model_module)
    model = model_class(idim, odim, args)
    assert isinstance(model, MTInterface)

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps((idim, odim, vars(args)), indent=4, ensure_ascii=False,
                       sort_keys=True).encode("utf_8"))
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu; scale the per-step batch size so each GPU
    # keeps the configured per-device batch size
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)"
                % (args.batch_size, args.batch_size * args.ngpu))
            args.batch_size *= args.ngpu

    # set torch device and training dtype (defaults to float32 when
    # train_dtype is an apex opt-level rather than a dtype name)
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)

    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
            sum(p.numel() for p in model.parameters() if p.requires_grad)
            * 100.0 / sum(p.numel() for p in model.parameters()),
        ))

    # Setup an optimizer
    if args.opt == "adadelta":
        optimizer = torch.optim.Adadelta(model.parameters(), rho=0.95,
                                         eps=args.eps,
                                         weight_decay=args.weight_decay)
    elif args.opt == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(
            model.parameters(),
            args.adim,
            args.transformer_warmup_steps,
            args.transformer_lr,
        )
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # setup apex.amp when train_dtype names an apex opt-level
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError as e:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux")
            raise e
        if args.opt == "noam":
            # noam wraps a torch optimizer; amp needs the inner instance
            model, optimizer.optimizer = amp.initialize(
                model, optimizer.optimizer, opt_level=args.train_dtype)
        else:
            model, optimizer = amp.initialize(model, optimizer,
                                              opt_level=args.train_dtype)
        use_apex = True
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    # (the chainer-style trainer serializes the optimizer's "target")
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter (batches json entries into model inputs)
    converter = CustomConverter()

    # read json data
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]

    # sortagrad: -1 means "shortest-first for all epochs", >0 for that many
    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    # make minibatch list (variable length)
    train = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        mt=True,
        iaxis=1,
        oaxis=0,
    )
    valid = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        mt=True,
        iaxis=1,
        oaxis=0,
    )

    load_tr = LoadInputsAndTargets(mode="mt", load_output=True)
    load_cv = LoadInputsAndTargets(mode="mt", load_output=True)

    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    # default collate function converts numpy array to pytorch tensor
    # we used an empty collate function instead which returns list
    train_iter = ChainerDataLoader(
        dataset=TransformDataset(train, lambda data: converter([load_tr(data)])),
        batch_size=1,
        num_workers=args.n_iter_processes,
        shuffle=not use_sortagrad,
        collate_fn=lambda x: x[0],
    )
    valid_iter = ChainerDataLoader(
        dataset=TransformDataset(valid, lambda data: converter([load_cv(data)])),
        batch_size=1,
        shuffle=False,
        collate_fn=lambda x: x[0],
        num_workers=args.n_iter_processes,
    )

    # Set up a trainer
    updater = CustomUpdater(
        model,
        args.grad_clip,
        {"main": train_iter},
        optimizer,
        device,
        args.ngpu,
        False,
        args.accum_grad,
        use_apex=use_apex,
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    if use_sortagrad:
        # switch back to shuffled batches once the sortagrad epochs are done
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs,
                     "epoch"),
        )

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    # (or every save_interval_iters iterations when that is set)
    if args.save_interval_iters > 0:
        trainer.extend(
            CustomEvaluator(model, {"main": valid_iter}, reporter, device,
                            args.ngpu),
            trigger=(args.save_interval_iters, "iteration"),
        )
    else:
        trainer.extend(
            CustomEvaluator(model, {"main": valid_iter}, reporter, device,
                            args.ngpu))

    # Save attention weight each epoch
    if args.num_save_attention > 0:
        # NOTE: sort it by output lengths
        data = sorted(
            list(valid_json.items())[:args.num_save_attention],
            key=lambda x: int(x[1]["output"][0]["shape"][0]),
            reverse=True,
        )
        # model may be wrapped (e.g. DataParallel exposes .module)
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
            ikey="output",
            iaxis=1,
        )
        trainer.extend(att_reporter, trigger=(1, "epoch"))
    else:
        att_reporter = None

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport(["main/loss", "validation/main/loss"], "epoch",
                              file_name="loss.png"))
    trainer.extend(
        extensions.PlotReport(["main/acc", "validation/main/acc"], "epoch",
                              file_name="acc.png"))
    trainer.extend(
        extensions.PlotReport(["main/ppl", "validation/main/ppl"], "epoch",
                              file_name="ppl.png"))
    trainer.extend(
        extensions.PlotReport(["main/bleu", "validation/main/bleu"], "epoch",
                              file_name="bleu.png"))

    # Save best models (lowest validation loss / highest validation accuracy)
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger("validation/main/loss"),
    )
    trainer.extend(
        snapshot_object(model, "model.acc.best"),
        trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
    )

    # save snapshot which contains model and optimizer states
    if args.save_interval_iters > 0:
        trainer.extend(
            torch_snapshot(filename="snapshot.iter.{.updater.iteration}"),
            trigger=(args.save_interval_iters, "iteration"),
        )
    else:
        trainer.extend(torch_snapshot(), trigger=(1, "epoch"))

    # epsilon decay in the optimizer: when the criterion metric degrades,
    # restore the best snapshot and shrink eps (adadelta) / lr (adam)
    if args.opt == "adadelta":
        if args.criterion == "acc":
            trainer.extend(
                restore_snapshot(model, args.outdir + "/model.acc.best",
                                 load_fn=torch_load),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(model, args.outdir + "/model.loss.best",
                                 load_fn=torch_load),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
    elif args.opt == "adam":
        if args.criterion == "acc":
            trainer.extend(
                restore_snapshot(model, args.outdir + "/model.acc.best",
                                 load_fn=torch_load),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adam_lr_decay(args.lr_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(model, args.outdir + "/model.loss.best",
                                 load_fn=torch_load),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adam_lr_decay(args.lr_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters, "iteration")))
    report_keys = [
        "epoch",
        "iteration",
        "main/loss",
        "validation/main/loss",
        "main/acc",
        "validation/main/acc",
        "main/ppl",
        "validation/main/ppl",
        "elapsed_time",
    ]
    if args.opt == "adadelta":
        # expose the current adadelta eps in the report
        trainer.extend(
            extensions.observe_value(
                "eps",
                lambda trainer: trainer.updater.get_optimizer("main").
                param_groups[0]["eps"],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("eps")
    elif args.opt in ["adam", "noam"]:
        # expose the current learning rate in the report
        trainer.extend(
            extensions.observe_value(
                "lr",
                lambda trainer: trainer.updater.get_optimizer("main").
                param_groups[0]["lr"],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("lr")
    if args.report_bleu:
        report_keys.append("main/bleu")
        report_keys.append("validation/main/bleu")
    trainer.extend(
        extensions.PrintReport(report_keys),
        trigger=(args.report_interval_iters, "iteration"),
    )

    trainer.extend(
        extensions.ProgressBar(update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        trainer.extend(
            TensorboardLogger(SummaryWriter(args.tensorboard_dir), att_reporter),
            trigger=(args.report_interval_iters, "iteration"),
        )

    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
def main():
    """Train an MLP image classifier on the pre-collected sample images.

    Builds the dataset from the sample_images_* directories, trains with
    Adam, reports/plots progress, and saves the trained model.
    """
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    # NOTE: help texts fixed — batchsize is the minibatch size, epoch is the
    # number of sweeps over the dataset.
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='Number of samples in each mini-batch')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--epoch', '-e', type=int, default=40,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Build the labeled dataset: label 1 = can, 2 = bin, 3 = PET bottle.
    dataset_gomi = []
    preprocessing(dataset_gomi, "sample_images_can1/pic", 10, 1)
    preprocessing(dataset_gomi, "sample_images_can2/pic", 10, 1)
    preprocessing(dataset_gomi, "sample_images_can3/pic", 20, 1)
    preprocessing(dataset_gomi, "sample_images_can4/pic", 10, 1)
    preprocessing(dataset_gomi, "sample_images_can5/pic", 10, 1)
    preprocessing(dataset_gomi, "sample_images_bin1/pic", 10, 2)
    preprocessing(dataset_gomi, "sample_images_bin2/pic", 10, 2)
    preprocessing(dataset_gomi, "sample_images_bin3/pic", 20, 2)
    preprocessing(dataset_gomi, "sample_images_pet1/pic", 10, 3)
    preprocessing(dataset_gomi, "sample_images_pet2/pic", 10, 3)
    preprocessing(dataset_gomi, "sample_images_pet3/pic", 10, 3)
    preprocessing(dataset_gomi, "sample_images_pet4/pic", 10, 3)

    # 120 samples for training, the rest for evaluation.
    train, test = split_dataset_random(dataset_gomi, 120, seed=0)

    model = L.Classifier(MLP(), lossfun=F.softmax_cross_entropy)
    if args.gpu >= 0:
        # BUG FIX: was .user(), which raises AttributeError — must be .use()
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu()

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(
        extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'],
                              'epoch', file_name='accuracy.png'))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())
    trainer.extend(extensions.LogReport())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    # Save the trained model on CPU so it can be reloaded anywhere.
    model.to_cpu()
    modelname = args.out + "/MLP.model"
    # BUG FIX: message typo ("same" -> "save") and the snapshot now goes to
    # the announced path instead of a hard-coded "model.npz".
    print('save the trained model: {}'.format(modelname))
    chainer.serializers.save_npz(modelname, model)
def main():
    """Train (or apply) a multi-layer perceptron regressor/classifier on a CSV.

    Reads a CSV, standardizes features and target, trains an MLP with a
    configurable optimizer, and writes target/prediction pairs for the
    held-out rows to result.txt.
    """
    # command line argument parsing
    parser = argparse.ArgumentParser(description='Multi-Perceptron classifier/regressor')
    parser.add_argument('dataset', help='Path to data file')
    parser.add_argument('--activation', '-a', choices=activ.keys(), default='sigmoid',
                        help='Activation function')
    parser.add_argument('--batchsize', '-b', type=int, default=50,
                        help='Number of samples in each mini-batch')
    parser.add_argument('--dropout_ratio', '-dr', type=float, default=0,
                        help='dropout ratio')
    parser.add_argument('--epoch', '-e', type=int, default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--snapshot', '-s', type=int, default=-1,
                        help='snapshot interval')
    parser.add_argument('--label_index', '-l', type=int, default=5,
                        help='Column number of the target variable (5=Melting)')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--outdir', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--out_ch', '-oc', type=int, default=1,
                        help='num of output channels. set to 1 for regression')
    parser.add_argument('--optimizer', '-op', default='AdaDelta',
                        help='optimizer {MomentumSGD,AdaDelta,AdaGrad,Adam}')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--skip_columns', '-sc', type=int, default=29,
                        help='num of columns which are not used as explanatory variables')
    parser.add_argument('--layers', '-nl', type=int, default=3,
                        help='Number of layers')
    parser.add_argument('--unit', '-nu', type=int, default=100,
                        help='Number of units in the hidden layers')
    parser.add_argument('--test_every', '-t', type=int, default=5,
                        help='use one in every ? entries in the dataset for validation')
    parser.add_argument('--predict', action='store_true')
    parser.add_argument('--weight_decay', '-w', type=float, default=0,
                        help='weight decay for regularization')
    args = parser.parse_args()
    # one output channel means regression; otherwise classification
    args.regress = (args.out_ch == 1)

    # select numpy or cupy
    xp = chainer.cuda.cupy if args.gpu >= 0 else np
    label_type = np.int32 if not args.regress else np.float32

    # read csv file
    dat = pd.read_csv(args.dataset, header=0)
    ind = np.ones(dat.shape[1], dtype=bool)    # indices for unused columns
    dat = dat.dropna(axis='columns')
    x = dat.iloc[:, args.skip_columns:].values
    args.in_ch = x.shape[1]
    t = (dat.iloc[:, args.label_index].values)[:, np.newaxis]
    print('target column:', args.label_index)
    print("data shape: ", x.shape, t.shape)

    x = np.array(x, dtype=np.float32)
    if args.regress:
        t = np.array(t, dtype=label_type)
    else:
        t = np.array(np.ndarray.flatten(t), dtype=label_type)

    # standardize
    # NOTE(review): the target is standardized even in classification mode,
    # which would turn int labels into floats — confirm classification is
    # actually exercised with this script.
    t_mean = np.mean(t)
    t_std = np.std(t)
    x_mean = np.mean(x)
    x_std = np.std(x)
    x = (x - x_mean) / x_std
    t = (t - t_mean) / t_std

    # Set up a neural network to train
    model = MLP(args, std=t_std)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()  # Make a specified GPU current
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimiser
    if args.optimizer == 'MomentumSGD':
        optimizer = chainer.optimizers.MomentumSGD(lr=0.003, momentum=0.9)
    elif args.optimizer == 'AdaDelta':
        optimizer = chainer.optimizers.AdaDelta(rho=0.95, eps=1e-06)
    elif args.optimizer == 'AdaGrad':
        optimizer = chainer.optimizers.AdaGrad(lr=0.001, eps=1e-08)
    elif args.optimizer == 'Adam':
        optimizer = chainer.optimizers.Adam(alpha=0.001, beta1=0.9,
                                            beta2=0.999, eps=1e-08)
    else:
        print("Wrong optimiser")
        exit(-1)
    optimizer.setup(model)
    if args.weight_decay > 0:
        optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))

    print('layers: {}, units: {}, optimiser: {}, Weight decay: {}, dropout ratio: {}'.format(
        args.layers, args.unit, args.optimizer, args.weight_decay, args.dropout_ratio))

    ## train-validation data
    # splitting by modulus of index: every test_every-th row is validation
    train_idx = [i for i in range(t.size) if (i + 1) % args.test_every != 0]
    var_idx = [i for i in range(t.size) if (i + 1) % args.test_every == 0]
    n = len(train_idx)
    train_idx.extend(var_idx)
    train, test = datasets.split_dataset(datasets.TupleDataset(x, t), n, train_idx)

    # dataset iterator
    train_iter = iterators.SerialIterator(train, args.batchsize, shuffle=True)
    test_iter = iterators.SerialIterator(test, args.batchsize,
                                         repeat=False, shuffle=False)
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.outdir)

    frequency = args.epoch if args.snapshot == -1 else max(1, args.snapshot)
    log_interval = 1, 'epoch'
    # NOTE(review): frequency/10 is a float interval — Chainer triggers
    # normally expect ints; confirm this behaves as intended.
    val_interval = frequency / 10, 'epoch'

    trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu),
                   trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss', 'main/MAE',
         'validation/main/MAE', 'main/accuracy', 'validation/main/accuracy',
         'elapsed_time']), trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    if not args.predict:
        trainer.run()
    else:
        # --predict: score the whole dataset instead of training
        test = datasets.TupleDataset(x, t)

    ## prediction
    print("predicting: {} entries...".format(len(test)))
    test_iter = iterators.SerialIterator(test, args.batchsize,
                                         repeat=False, shuffle=False)
    converter = concat_examples
    idx = 0
    with open(os.path.join(args.outdir, 'result.txt'), 'w') as output:
        for batch in test_iter:
            x, t = converter(batch, device=args.gpu)
            with chainer.using_config('train', False):
                with chainer.function.no_backprop_mode():
                    if args.regress:
                        y = model(x).data
                        if args.gpu > -1:
                            y = xp.asnumpy(y)
                            t = xp.asnumpy(t)
                        # undo target standardization for readable output
                        y = y * t_std + t_mean
                        t = t * t_std + t_mean
                    else:
                        y = F.softmax(model(x)).data
                        if args.gpu > -1:
                            y = xp.asnumpy(y)
                            t = xp.asnumpy(t)
            for i in range(y.shape[0]):
                # NOTE(review): var_idx[i] uses the within-batch index, so the
                # row identifier is wrong from the second batch on (and
                # --predict iterates the full dataset) — verify the intended
                # row-id mapping before relying on the first column.
                output.write(str(dat.iloc[var_idx[i], 0]))
                if (len(t.shape) > 1):
                    for j in range(t.shape[1]):
                        output.write(",{}".format(t[i, j]))
                        output.write(",{}".format(y[i, j]))
                else:
                    # BUG FIX: was "{0:1.5f},{0:1.5f}", which wrote the target
                    # twice and dropped the prediction entirely.
                    output.write(",{0:1.5f},{1:1.5f}".format(t[i], y[i]))
                output.write("\n")
            idx += 1