def _prepare_multinode_snapshot(n, result):
    n_units = 100
    batchsize = 10
    comm = create_communicator('naive')
    model = L.Classifier(MLP(n_units, 10))
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    if comm.rank == 0:
        train, _ = chainer.datasets.get_mnist()
    else:
        train, _ = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    train_iter = chainer.iterators.SerialIterator(train, batchsize)

    updater = StandardUpdater(train_iter, optimizer)
    trainer = Trainer(updater, out=result)

    snapshot = extensions.snapshot(target=updater, autoload=True)
    replica_sets = []
    mn_snapshot = multi_node_snapshot(comm, snapshot, replica_sets)
    mn_snapshot.initialize(trainer)

    for _ in range(n):
        updater.update()

    return updater, mn_snapshot, trainer
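
# Hedged sketch (assumption, not part of the original source): the helper above
# relies on an `MLP` link that is defined elsewhere. A minimal two-hidden-layer
# perceptron in the style of the Chainer MNIST examples could look like this;
# the exact architecture used by the original code is an assumption.
import chainer
import chainer.functions as F
import chainer.links as L


class MLP(chainer.Chain):
    """Simple feed-forward network with `n_out` output units."""

    def __init__(self, n_units, n_out):
        super(MLP, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(None, n_units)
            self.l2 = L.Linear(None, n_units)
            self.l3 = L.Linear(None, n_out)

    def __call__(self, x):
        h = F.relu(self.l1(x))
        h = F.relu(self.l2(h))
        return self.l3(h)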
def train(args):
    logger = logging.getLogger()
    logger.setLevel(getattr(logging, 'INFO'))
    logger.addHandler(logging.StreamHandler())
    rangelog.set_logger(logger)
    rangelog.set_start_msg("start... {name}")
    rangelog.set_end_msg(" end...")

    with rangelog("creating dataset") as logger:
        train_set, eval_set = get_cifar10()
        if args.sample_pairing:
            train_set = SamplePairingDataset(train_set)
    with rangelog("creating iterator") as logger:
        logger.info("train_set: {}, eval_set: {}".format(
            len(train_set), len(eval_set)))
        iterator = SerialIterator(train_set, args.batch, repeat=True)
        eval_iterator = SerialIterator(eval_set, args.batch, repeat=False)
    with rangelog("creating model") as logger:
        logger.info('GPU: {}'.format(args.device))
        model = Conv(10)
        chainer.cuda.get_device_from_id(args.device).use()
        model.to_gpu(args.device)
    with rangelog("creating optimizer"):
        optimizer = optimizers.Adam()
        optimizer.setup(model)
    with rangelog("creating trainer"):
        updater = StandardUpdater(
            iterator=iterator, optimizer=optimizer, device=args.device)
        trainer = training.Trainer(
            updater, (args.epoch, 'epoch'), out=args.store)
    with rangelog("trainer extension") as logger:
        trainer.extend(
            extensions.Evaluator(
                iterator=eval_iterator, target=model, device=args.device))
        trainer.extend(extensions.LogReport())
        trainer.extend(SourceBackup())
        trainer.extend(ArgumentBackup(args))
        try:
            slack = json.load(open("slack.json"))
        except Exception as e:
            logger.warn("Error {}".format(e))
        else:
            trainer.extend(SlackPost(slack["token"], slack["channel"]))
        trainer.extend(extensions.PrintReport(['epoch'] + args.report_keys))
        trainer.extend(extensions.ProgressBar(update_interval=1))
        trainer.extend(
            extensions.PlotReport(
                args.report_keys, 'epoch', file_name='plot.png'))
        trigger = MinValueTrigger(key='validation/main/loss')
        snapshoter = snapshot_object(model, filename=args.model_path)
        trainer.extend(snapshoter, trigger=trigger)
    with rangelog("training"):
        trainer.run()
    return model
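
# Hedged usage sketch (assumption, not part of the original source): `train()`
# only touches a handful of attributes of `args`, so it can be driven from a
# small argparse parser like the one below. Defaults are illustrative only.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='CIFAR-10 training')
    parser.add_argument('--batch', type=int, default=64)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--store', default='result')
    parser.add_argument('--model_path', default='model.npz')
    parser.add_argument('--sample_pairing', action='store_true')
    parser.add_argument('--report_keys', nargs='+',
                        default=['main/loss', 'validation/main/loss'])
    return parser.parse_args()


if __name__ == '__main__':
    train(parse_args())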
def optimize(self, M: np.ndarray, X: np.ndarray):
    """Find the optimal affine transformation which minimizes the loss
    defined in :py:func:`AffineTransformation.get_loss_func`.

    Args:
        M: Stacked motion matrix of shape (2 * n_views, 3)
        X: 3D point cloud of shape (n_points, 3)
    """
    data_iter = iterators.SerialIterator(MotionMatrices(M), self.batchsize)
    object_iter = iterators.SerialIterator(Objects(X), 1, repeat=False)

    optimizer = optimizers.MomentumSGD(lr=self.learning_rate)
    optimizer.setup(self.model)

    updater = StandardUpdater(data_iter, optimizer,
                              loss_func=self.model.get_loss_func())
    log_interval = (1, 'epoch')
    trainer = chainer.training.Trainer(updater, (self.epoch, 'epoch'))
    if self.X_eval is not None:
        trainer.extend(extensions.Evaluator(
            object_iter, self.model,
            eval_func=self.get_recornstruction_error_func()),
            trigger=(1, 'epoch'))
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'main/loss', 'reconstruction_error']),
        trigger=log_interval)
    trainer.run()
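
# Hedged usage sketch (assumption): `optimize` is a method of an
# `AffineTransformation`-style object that exposes `batchsize`,
# `learning_rate`, `epoch`, `model` and `X_eval`. The constructor call below
# is illustrative; its real arguments are not shown in this section.
import numpy as np

n_views, n_points = 4, 500
M = np.random.randn(2 * n_views, 3).astype(np.float32)  # stacked motion matrix
X = np.random.randn(n_points, 3).astype(np.float32)     # 3D point cloud
# affine = AffineTransformation()  # assumed constructor
# affine.optimize(M, X)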
def main():
    train_x, train_y, val_x, val_y = load_pascal_voc_dataset(DATASET_ROOT)
    train_dataset = YoloDataset(train_x, train_y,
                                target_size=model_class.img_size,
                                n_grid=model_class.n_grid, augment=True)
    test_dataset = YoloDataset(val_x, val_y,
                               target_size=model_class.img_size,
                               n_grid=model_class.n_grid, augment=False)

    class_weights = [1.0 for i in range(train_dataset.n_classes)]
    class_weights[0] = 0.2
    model = model_class(n_classes=train_dataset.n_classes, n_base_units=6,
                        class_weights=class_weights)
    if os.path.exists(RESULT_DIR + '/model_last.npz'):
        print('continue from previous result')
        chainer.serializers.load_npz(RESULT_DIR + '/model_last.npz', model)

    optimizer = Adam()
    optimizer.setup(model)

    train_iter = SerialIterator(train_dataset, batch_size=BATCH_SIZE)
    test_iter = SerialIterator(test_dataset, batch_size=BATCH_SIZE,
                               shuffle=False, repeat=False)
    updater = StandardUpdater(train_iter, optimizer, device=DEVICE)
    trainer = Trainer(updater, (N_EPOCHS, 'epoch'), out=RESULT_DIR)

    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.Evaluator(test_iter, model, device=DEVICE))
    trainer.extend(
        extensions.PrintReport([
            'main/loss', 'validation/main/loss',
            'main/cl_loss', 'validation/main/cl_loss',
            'main/cl_acc', 'validation/main/cl_acc',
            'main/pos_loss', 'validation/main/pos_loss',
        ]))
    trainer.extend(extensions.snapshot_object(model, 'best_loss.npz'),
                   trigger=triggers.MinValueTrigger('validation/main/loss'))
    trainer.extend(extensions.snapshot_object(model, 'best_classification.npz'),
                   trigger=triggers.MaxValueTrigger('validation/main/cl_acc'))
    trainer.extend(
        extensions.snapshot_object(model, 'best_position.npz'),
        trigger=triggers.MinValueTrigger('validation/main/pos_loss'))
    trainer.extend(extensions.snapshot_object(model, 'model_last.npz'),
                   trigger=(1, 'epoch'))

    trainer.run()
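
# Hedged configuration sketch (assumption): `main()` above depends on several
# module-level names that the real script defines elsewhere. The values below
# are illustrative only.
DATASET_ROOT = './VOCdevkit'   # Pascal VOC root directory
RESULT_DIR = './results/yolo'
BATCH_SIZE = 16
N_EPOCHS = 100
DEVICE = 0                     # GPU id; -1 for CPU
# model_class = ...            # detector class exposing img_size and n_grid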
def chainer_model_pipe(self, nn, train, valid, params):
    epoch = params['epoch']
    batch_size = params['batch_size']
    use_gpu = params['use_gpu']
    if 'fixed_base_w' in params.keys():
        fixed_base_w = params['fixed_base_w']
    else:
        fixed_base_w = False

    # Model instance
    model = L.Classifier(nn)

    if use_gpu:
        device = 0
        model.to_gpu(device)
    else:
        device = -1

    # Create mini-batch iterators
    train_iter = SerialIterator(train, batch_size)
    valid_iter = SerialIterator(valid, batch_size,
                                repeat=False, shuffle=False)

    # Set up learning
    optimizer = Adam()
    optimizer.setup(model)

    if fixed_base_w:
        model.predictor.base.disable_update()

    updater = StandardUpdater(train_iter, optimizer, device=device)
    trainer = Trainer(updater, (epoch, 'epoch'), out='result/cat_dog')
    trainer.extend(Evaluator(valid_iter, model, device=device))
    trainer.extend(LogReport(trigger=(1, 'epoch')))
    trainer.extend(PrintReport([
        'epoch', 'main/accuracy', 'validation/main/accuracy',
        'main/loss', 'validation/main/loss', 'elapsed_time'
    ]), trigger=(1, 'epoch'))

    trainer.run()

    if use_gpu:
        model.to_cpu()

    return model
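
# Hedged usage sketch (assumption): the pipe above is a method, so it would be
# called on whatever class defines it, with a params dict like this one.
params = {
    'epoch': 20,
    'batch_size': 32,
    'use_gpu': True,
    'fixed_base_w': True,  # freeze the pretrained base while fine-tuning
}
# model = pipeline.chainer_model_pipe(nn, train, valid, params)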
def main():
    # input_size: 299
    # model = InceptionV4(dim_out=17)
    # model = InceptionV4(dim_out=17, base_filter_num=6, ablocks=2, bblocks=1, cblocks=1)
    # model = InceptionResNetV2(dim_out=17)
    # model = InceptionResNetV2(dim_out=17, base_filter_num=8, ablocks=1, bblocks=2, cblocks=1)

    # input_size: 224
    # model = VGGNetBN(17)        # VGGNet original size
    # model = VGGNetBN(17, 16)    # VGGNet 1/4 of filter num
    # model = GoogLeNetBN(17)     # GoogLeNet original size
    # model = GoogLeNetBN(17, 16) # GoogLeNet 1/2 filter num
    # model = GoogLeNetBN(17, 8)  # GoogLeNet 1/4 filter num
    # model = ResNet50(17)        # ResNet50 original size
    # model = ResNet50(17, 32)    # ResNet50 1/2 size
    # model = ResNet50(17, 16)    # ResNet50 1/4 size
    # model = SqueezeNet(17)      # SqueezeNet original size
    # model = SqueezeNet(17, 8)   # SqueezeNet 1/2 filter num
    # model = MobileNet(17)       # MobileNet original size
    # model = MobileNet(17, 16)   # MobileNet 1/2 filter num
    # model = MobileNet(17, 8)    # MobileNet 1/4 filter num

    # input_size: 100
    # model = FaceClassifier100x100V2(n_classes=17)
    model = FaceClassifier100x100V(n_classes=17)

    optimizer = Adam()
    optimizer.setup(model)

    train_dataset = load_dataset('train.tsv', True)
    test_dataset = load_dataset('test.tsv')

    train_iter = SerialIterator(train_dataset, batch_size=BATCH_SIZE)
    test_iter = SerialIterator(test_dataset, batch_size=BATCH_SIZE,
                               shuffle=False, repeat=False)
    updater = StandardUpdater(train_iter, optimizer, device=DEVICE)
    trainer = Trainer(updater, (N_EPOCHS, 'epoch'), out='result')

    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.Evaluator(test_iter, model, device=DEVICE))
    trainer.extend(extensions.PrintReport(
        ['main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy']))
    # trainer.extend(extensions.snapshot_object(model, 'snapshot_{.updater.epoch}.model'))

    trainer.run()

    chainer.serializers.save_npz('result/model.npz', model.to_cpu())
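
# Hedged sketch (assumption): `load_dataset` is not shown above. A minimal
# version reading "path<TAB>label" rows into a Chainer LabeledImageDataset
# might look like this; how the real loader handles `augment` may differ.
import chainer


def load_dataset(tsv_path, augment=False):
    pairs = []
    with open(tsv_path) as f:
        for line in f:
            path, label = line.rstrip('\n').split('\t')
            pairs.append((path, int(label)))
    # `augment` is accepted for interface compatibility; the real loader
    # presumably applies data augmentation when it is True.
    return chainer.datasets.LabeledImageDataset(pairs)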
def train():
    # train_txt = "/media/common-ns/New Volume/reseach/Dataset/OU-ISIR_by_Setoguchi/CV01.txt"
    train_dir = "/media/common-ns/New Volume/reseach/Dataset/OU-ISIR_by_Setoguchi/Gallery/CV01(Gallery)/*"
    train = load_OULP(path_dir=train_dir)
    # print(train[0])

    # Target (teacher) data
    # train = train[0:1000]
    train = [i[0] for i in train]  # keep only the data path from each (path, label) pair
    # Pair each image with itself so the autoencoder target equals the input
    train = datasets.TupleDataset(train, train)

    batch_size = 195
    train_iter = chainer.iterators.SerialIterator(train, batch_size=batch_size)

    # model = L.Classifier(Autoencoder(), lossfun=F.mean_squared_error)
    model = L.Classifier(CAE(), lossfun=sce_loss)
    model.compute_accuracy = False
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    updater = StandardUpdater(train_iter, optimizer, device=0)
    trainer = Trainer(
        updater,
        (1000, 'epoch'),
        out="result",
    )

    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(['epoch', 'main/loss']))
    trainer.extend(extensions.snapshot(), trigger=(200, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        target=model, filename='model_snapshot_{.updater.iteration}'),
        trigger=(250, 'epoch'))
    trainer.extend(extensions.ProgressBar())
    trainer.run()

    serializers.save_npz(
        "/home/common-ns/setoguchi/chainer_files/Convolutional_Auto_Encoder/CAE_FC_model",
        model)
def run_linear_network(loss_fn, alpha=0.3, batch_size=2):
    # Get data
    np.random.seed(42)
    dataset = get_dataset()
    iterator = SerialIterator(dataset, batch_size, repeat=True, shuffle=True)

    # Set up network and loss
    predictor = L.Linear(None, 1)
    ranker = Ranker(predictor)
    loss = Loss(ranker, loss_fn)

    # Optimizer
    optimizer = Adam(alpha=alpha)
    optimizer.setup(loss)

    updater = StandardUpdater(iterator, optimizer, converter=zeropad_concat)
    trainer = Trainer(updater, (100, 'epoch'))
    log_report = extensions.LogReport(log_name=None)
    trainer.extend(log_report)

    np.random.seed(42)
    trainer.run()

    last_ndcg = log_report.log[-1]['ndcg']
    return last_ndcg
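
# Hedged usage sketch (assumption): `run_linear_network` is parameterized by a
# listwise ranking loss; any callable with the expected signature can be
# passed. The loss name below is illustrative, not taken from this source.
# ndcg = run_linear_network(listnet_loss, alpha=0.3, batch_size=2)
# print('final NDCG: {:.3f}'.format(ndcg))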
def main():
    parser = argparse.ArgumentParser(description='training mnist')
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--epoch', '-e', type=int, default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--batchsize', '-b', type=int, default=8,
                        help='Number of images in each mini-batch')
    parser.add_argument('--seed', '-s', type=int, default=0,
                        help='Random seed')
    parser.add_argument('--report_trigger', '-rt', type=str, default='1e',
                        help='Interval for reporting (Ex.100i, default:1e)')
    parser.add_argument('--save_trigger', '-st', type=str, default='1e',
                        help='Interval for saving the model (Ex.100i, default:1e)')
    parser.add_argument('--load_model', '-lm', type=str, default=None,
                        help='Path of the model object to load')
    parser.add_argument('--load_optimizer', '-lo', type=str, default=None,
                        help='Path of the optimizer object to load')
    args = parser.parse_args()

    start_time = datetime.now()
    save_dir = Path('output/{}'.format(start_time.strftime('%Y%m%d_%H%M')))

    random.seed(args.seed)
    np.random.seed(args.seed)
    cupy.random.seed(args.seed)

    backbone = 'mobilenet'
    model = ModifiedClassifier(DeepLab(n_class=13, task='semantic',
                                       backbone=backbone),
                               lossfun=F.softmax_cross_entropy)
    if args.load_model is not None:
        serializers.load_npz(args.load_model, model)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    optimizer = optimizers.Adam(alpha=1e-3)
    optimizer.setup(model)
    if args.load_optimizer is not None:
        serializers.load_npz(args.load_optimizer, optimizer)

    dir_path = './dataset/2D-3D-S/'
    augmentations = {'mirror': 0.5, 'flip': 0.5}
    train_data = Stanford2D3DS(dir_path, 'semantic', area='1 2 3 4', train=True)
    train_data.set_augmentations(crop=513, augmentations=augmentations)
    valid_data = Stanford2D3DS(dir_path, 'semantic', area='6', train=False,
                               n_data=100)
    valid_data.set_augmentations(crop=513)

    train_iter = iterators.MultiprocessIterator(train_data, args.batchsize,
                                                n_processes=1)
    valid_iter = iterators.MultiprocessIterator(valid_data, args.batchsize,
                                                repeat=False, shuffle=False,
                                                n_processes=1)

    updater = StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = Trainer(updater, (args.epoch, 'epoch'), out=save_dir)

    label_list = list(valid_data.label_dict.keys())[1:]
    report_trigger = (int(args.report_trigger[:-1]), 'iteration'
                      if args.report_trigger[-1] == 'i' else 'epoch')
    trainer.extend(extensions.LogReport(trigger=report_trigger))
    trainer.extend(ModifiedEvaluator(valid_iter, model,
                                     label_names=label_list,
                                     device=args.gpu),
                   name='val', trigger=report_trigger)
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'main/loss', 'main/acc', 'val/main/loss',
         'val/main/acc', 'val/main/mean_class_acc', 'val/main/miou',
         'elapsed_time']), trigger=report_trigger)
    trainer.extend(extensions.PlotReport(['main/loss', 'val/main/loss'],
                                         x_key=report_trigger[1], marker='.',
                                         file_name='loss.png',
                                         trigger=report_trigger))
    trainer.extend(extensions.PlotReport(['main/acc', 'val/main/acc'],
                                         x_key=report_trigger[1], marker='.',
                                         file_name='accuracy.png',
                                         trigger=report_trigger))

    class_accuracy_report = ['val/main/mean_class_acc']
    class_accuracy_report.extend(['val/main/class_acc/{}'.format(label)
                                  for label in label_list])
    class_iou_report = ['val/main/miou']
    class_iou_report.extend(['val/main/iou/{}'.format(label)
                             for label in label_list])
    trainer.extend(extensions.PlotReport(class_accuracy_report,
                                         x_key=report_trigger[1], marker='.',
                                         file_name='class_accuracy.png',
                                         trigger=report_trigger))
    trainer.extend(extensions.PlotReport(class_iou_report,
                                         x_key=report_trigger[1], marker='.',
                                         file_name='class_iou.png',
                                         trigger=report_trigger))

    save_trigger = (int(args.save_trigger[:-1]), 'iteration'
                    if args.save_trigger[-1] == 'i' else 'epoch')
    trainer.extend(extensions.snapshot_object(
        model, filename='model_{0}-{{.updater.{0}}}.npz'.format(save_trigger[1])),
        trigger=save_trigger)
    trainer.extend(extensions.snapshot_object(
        optimizer,
        filename='optimizer_{0}-{{.updater.{0}}}.npz'.format(save_trigger[1])),
        trigger=save_trigger)

    if save_dir.exists():
        shutil.rmtree(save_dir)
    save_dir.mkdir()
    (save_dir / 'training_details').mkdir()

    # Write parameters text
    with open(save_dir / 'training_details/train_params.txt', 'w') as f:
        f.write('model: {}(backbone: {})\n'.format(
            model.predictor.__class__.__name__, backbone))
        f.write('n_epoch: {}\n'.format(args.epoch))
        f.write('batch_size: {}\n'.format(args.batchsize))
        f.write('n_data_train: {}\n'.format(len(train_data)))
        f.write('n_data_val: {}\n'.format(len(valid_data)))
        f.write('seed: {}\n'.format(args.seed))
        if len(augmentations) > 0:
            f.write('[augmentation]\n')
            for process in augmentations:
                f.write(' {}: {}\n'.format(process, augmentations[process]))

    trainer.run()
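
# Hedged helper sketch (assumption, not part of the original source): several
# of these scripts parse trigger strings such as '1e' or '100i' inline; the
# same logic can be factored into a small helper.
def parse_trigger(spec):
    """Turn '100i' into (100, 'iteration') and '1e' into (1, 'epoch')."""
    return int(spec[:-1]), 'iteration' if spec[-1] == 'i' else 'epoch'

# report_trigger = parse_trigger(args.report_trigger)
# save_trigger = parse_trigger(args.save_trigger)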
def main():
    args = create_args('train')
    result_dir = create_result_dir(args.model_name)

    # Prepare devices
    devices = get_gpu_dict(args.gpus)

    # Instantiate a model
    model = RegNet(epsilon=args.epsilon)

    # Instantiate an optimizer
    optimizer = get_optimizer(model, **vars(args))

    # Set up datasets
    prep = TransformDataset(KITTI(args.kitti_path, 'train'),
                            CalibPrepare(args.init_pose))
    train, valid = split_dataset(
        prep, round(len(prep) * (1 - args.valid_proportion)))
    print("========== Model Parameters ==========")
    print("location loss weight (epsilon):", args.epsilon)
    print('train samples: {}, valid samples: {}'.format(
        len(train), len(valid)))

    # Iterator
    if DEBUG:
        Iterator = SerialIterator
    else:
        Iterator = MultiprocessIterator
    train_iter = Iterator(train, args.batchsize)
    valid_iter = Iterator(valid, args.valid_batchsize,
                          repeat=False, shuffle=False)

    # Updater
    if DEBUG:
        updater = StandardUpdater(train_iter, optimizer,
                                  device=devices['main'])
    else:
        updater = ParallelUpdater(train_iter, optimizer, devices=devices)
    trainer = Trainer(updater, (args.epoch, 'epoch'), out=result_dir)

    # Extensions
    trainer.extend(extensions.Evaluator(valid_iter, model,
                                        device=devices['main']),
                   trigger=(args.valid_freq, 'epoch'))
    trainer.extend(extensions.snapshot(),
                   trigger=(args.snapshot_iter, 'iteration'))
    trainer.extend(extensions.LogReport(),
                   trigger=(args.show_log_iter, 'iteration'))
    trainer.extend(extensions.ProgressBar(update_interval=20))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'elapsed_time'
        ]))

    # Resume from snapshot
    if args.resume_from:
        chainer.serializers.load_npz(args.resume_from, trainer)

    # Train and save
    print("========== Training ==========")
    hook = CupyMemoryProfileHook()
    with hook:
        trainer.run()

    print("========== Saving ==========")
    chainer.serializers.save_hdf5(create_result_file(args.model_name), model)
    print("Done.")

    print("========== Memory Profiling ==========")
    hook.print_report()
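
# Hedged sketch (assumption): `get_gpu_dict` is not shown above. ParallelUpdater
# expects a dict with a 'main' device plus additional device entries, so a
# plausible implementation parsing a comma-separated `--gpus` string is:
def get_gpu_dict(gpus):
    ids = [int(g) for g in str(gpus).split(',')]
    devices = {'main': ids[0]}
    for i, gpu_id in enumerate(ids[1:], start=1):
        devices['gpu{}'.format(i)] = gpu_id
    return devices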
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--step_size', '-ss', type=int, default=3000,
                        help='step_size for lr exponential')
    parser.add_argument('--gradclip', '-c', type=float, default=5,
                        help='Gradient norm threshold to clip')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--pretrain', '-pr', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--snapshot', '-snap', type=int, default=100,
                        help='snapshot iteration for save checkpoint')
    parser.add_argument('--test_mode', action='store_true',
                        help='Use tiny datasets for quick tests')
    parser.add_argument('--valid', '-val', default='',
                        help='Validation directory path containing the validation txt file')
    parser.add_argument('--test', '-tt', default='graph_test',
                        help='Test directory path containing the test txt file')
    parser.add_argument('--train', '-tr', default="D:/toy/",
                        help='Train directory path containing the train txt file')
    parser.add_argument('--train_edge', default="all",
                        help="train temporal/all for comparison")
    parser.add_argument('--database', default="BP4D",
                        help="BP4D/DISFA")
    parser.add_argument('--use_pure_python', action='store_true',
                        help='use pure python code to check whether your '
                             'optimized code works correctly')
    parser.add_argument('--lr', '-l', type=float, default=0.1)
    parser.add_argument("--profile", "-p", action="store_true",
                        help="whether to profile to examine speed bottleneck")
    parser.add_argument("--num_attrib", type=int, default=2048,
                        help="node feature dimension")
    parser.add_argument("--need_cache_graph", "-ng", action="store_true",
                        help="whether to cache factor graph to LRU cache")
    parser.add_argument("--eval_mode", '-eval', action="store_true",
                        help="whether to run evaluation or not")
    parser.add_argument("--proc_num", "-pn", type=int, default=1)
    parser.add_argument("--resume", action="store_true",
                        help="resume from pretrained model")
    parser.set_defaults(test=False)
    args = parser.parse_args()

    config.OPEN_CRF_CONFIG["use_pure_python"] = args.use_pure_python
    # These imports come after modifying config.OPEN_CRF_CONFIG because the
    # setting influences the open_crf layer.
    from graph_learning.dataset.crf_pact_structure import CRFPackageStructure
    from graph_learning.dataset.graph_dataset import GraphDataset
    from graph_learning.extensions.opencrf_evaluator import OpenCRFEvaluator
    from graph_learning.dataset.graph_dataset_reader import GlobalDataSet
    from graph_learning.updater.bptt_updater import convert
    from graph_learning.extensions.AU_roi_label_split_evaluator import ActionUnitEvaluator
    if args.use_pure_python:
        from graph_learning.model.open_crf.pure_python.open_crf_layer import OpenCRFLayer
    else:
        from graph_learning.model.open_crf.cython.open_crf_layer import OpenCRFLayer

    print_interval = 1, 'iteration'
    val_interval = (5, 'iteration')

    adaptive_AU_database(args.database)
    root_dir = os.path.dirname(os.path.dirname(args.train))

    dataset = GlobalDataSet(num_attrib=args.num_attrib,
                            train_edge=args.train_edge)
    file_name = list(
        filter(lambda e: e.endswith(".txt"), os.listdir(args.train)))[0]
    sample = dataset.load_data(args.train + os.sep + file_name)
    print("pre load done")

    crf_pact_structure = CRFPackageStructure(
        sample, dataset, num_attrib=dataset.num_attrib_type, need_s_rnn=False)
    model = OpenCRFLayer(node_in_size=dataset.num_attrib_type,
                         weight_len=crf_pact_structure.num_feature)

    train_str = args.train
    if train_str[-1] == "/":
        train_str = train_str[:-1]
    trainer_keyword = os.path.basename(train_str)
    trainer_keyword_tuple = tuple(trainer_keyword.split("_"))
    LABEL_SPLIT = config.BP4D_LABEL_SPLIT if args.database == "BP4D" else config.DISFA_LABEL_SPLIT
    if trainer_keyword_tuple not in LABEL_SPLIT:
        return
    # assert "_" in trainer_keyword

    train_data = GraphDataset(args.train,
                              attrib_size=dataset.num_attrib_type,
                              global_dataset=dataset,
                              need_s_rnn=False,
                              need_cache_factor_graph=args.need_cache_graph,
                              get_geometry_feature=False)
    if args.proc_num == 1:
        train_iter = chainer.iterators.SerialIterator(train_data, 1,
                                                      shuffle=True)
    elif args.proc_num > 1:
        train_iter = chainer.iterators.MultiprocessIterator(
            train_data, batch_size=1, n_processes=args.proc_num,
            repeat=True, shuffle=True, n_prefetch=10, shared_mem=31457280)

    optimizer = chainer.optimizers.SGD(lr=args.lr)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.gradclip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0005))

    updater = StandardUpdater(train_iter, optimizer, converter=convert)
    trainer = chainer.training.Trainer(updater, (args.epoch, 'epoch'),
                                       out=args.out)

    interval = 1
    if args.test_mode:
        chainer.config.train = False
    trainer.extend(
        PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
            "opencrf_val/main/hit",      # "opencrf_validation/main/U_hit",
            "opencrf_val/main/miss",     # "opencrf_validation/main/U_miss",
            "opencrf_val/main/F1",       # "opencrf_validation/main/U_F1"
            'opencrf_val/main/accuracy',
        ]), trigger=print_interval)
    trainer.extend(chainer.training.extensions.observe_lr(),
                   trigger=print_interval)
    trainer.extend(
        chainer.training.extensions.LogReport(
            trigger=print_interval,
            log_name="open_crf_{}.log".format(trainer_keyword)))

    optimizer_snapshot_name = "{0}_{1}_opencrf_optimizer.npz".format(
        trainer_keyword, args.database)
    model_snapshot_name = "{0}_{1}_opencrf_model.npz".format(
        trainer_keyword, args.database)
    trainer.extend(chainer.training.extensions.snapshot_object(
        optimizer, filename=optimizer_snapshot_name),
        trigger=(args.snapshot, 'iteration'))
    trainer.extend(chainer.training.extensions.snapshot_object(
        model, filename=model_snapshot_name),
        trigger=(args.snapshot, 'iteration'))

    if args.resume and os.path.exists(args.out + os.sep + model_snapshot_name):
        print("loading model_snapshot_name to model")
        chainer.serializers.load_npz(args.out + os.sep + model_snapshot_name,
                                     model)
    if args.resume and os.path.exists(args.out + os.sep + optimizer_snapshot_name):
        print("loading optimizer_snapshot_name to optimizer")
        chainer.serializers.load_npz(
            args.out + os.sep + optimizer_snapshot_name, optimizer)

    # trainer.extend(chainer.training.extensions.ProgressBar(update_interval=1))
    # trainer.extend(chainer.training.extensions.snapshot(),
    #                trigger=(args.snapshot, 'epoch'))
    # trainer.extend(chainer.training.extensions.ExponentialShift('lr', 0.9),
    #                trigger=(1, 'epoch'))

    if chainer.training.extensions.PlotReport.available():
        trainer.extend(chainer.training.extensions.PlotReport(
            ['main/loss'],
            file_name="{}_train_loss.png".format(trainer_keyword)),
            trigger=(100, "iteration"))
        trainer.extend(chainer.training.extensions.PlotReport(
            ['opencrf_val/F1', 'opencrf_val/accuracy'],
            file_name="{}_val_f1.png".format(trainer_keyword)),
            trigger=val_interval)

    if args.valid:
        valid_data = GraphDataset(
            args.valid, attrib_size=dataset.num_attrib_type,
            global_dataset=dataset, need_s_rnn=False,
            need_cache_factor_graph=args.need_cache_graph)
        validate_iter = chainer.iterators.SerialIterator(valid_data, 1,
                                                         repeat=False,
                                                         shuffle=False)
        evaluator = OpenCRFEvaluator(iterator=validate_iter, target=model,
                                     device=-1)
        trainer.extend(evaluator, trigger=val_interval)

    if args.profile:
        cProfile.runctx("trainer.run()", globals(), locals(), "Profile.prof")
        s = pstats.Stats("Profile.prof")
        s.strip_dirs().sort_stats("time").print_stats()
    else:
        trainer.run()
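
# Hedged usage sketch (assumption): typical invocations of this script. The
# script filename and paths below are illustrative only.
#
#   python train_open_crf.py --train D:/toy/ --database BP4D --lr 0.1
#   python train_open_crf.py --train D:/toy/ --profile   # dump cProfile stats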
def main():
    parser = argparse.ArgumentParser(description='training mnist')
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--epoch', '-e', type=int, default=300,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--seed', '-s', type=int, default=0,
                        help='Random seed')
    parser.add_argument('--report_trigger', '-rt', type=str, default='1e',
                        help='Interval for reporting (Ex.100i/1e)')
    parser.add_argument('--save_trigger', '-st', type=str, default='1e',
                        help='Interval for saving the model (Ex.100i/1e)')
    parser.add_argument('--load_model', '-lm', type=str, default=None,
                        help='Path of the model object to load')
    parser.add_argument('--load_optimizer', '-lo', type=str, default=None,
                        help='Path of the optimizer object to load')
    args = parser.parse_args()

    if not Path('output').exists():
        Path('output').mkdir()
    start_time = datetime.now()
    save_dir = Path('output/{}'.format(start_time.strftime('%Y%m%d_%H%M')))

    random.seed(args.seed)
    np.random.seed(args.seed)
    cupy.random.seed(args.seed)
    chainer.config.cudnn_deterministic = True

    model = L.Classifier(SEResNet50(n_class=101))
    # model = L.Classifier(SERes2Net50(n_class=101))
    # model = L.Classifier(GCResNet50(n_class=101))
    # model = L.Classifier(AAResNet50(n_class=101))
    if args.load_model is not None:
        serializers.load_npz(args.load_model, model)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    optimizer = optimizers.Adam(alpha=1e-3, weight_decay_rate=1e-4,
                                amsgrad=True)
    optimizer.setup(model)
    if args.load_optimizer is not None:
        serializers.load_npz(args.load_optimizer, optimizer)

    augmentation = {
        'HorizontalFlip': {'p': 0.5},
        'PadIfNeeded': {'p': 1.0, 'min_height': 512, 'min_width': 512},
        'Rotate': {'p': 1.0, 'limit': 15, 'interpolation': 1},
        'Resize': {'p': 1.0, 'height': 248, 'width': 248, 'interpolation': 2},
        'RandomScale': {'p': 1.0, 'scale_limit': 0.09, 'interpolation': 2},
        'RandomCrop': {'p': 1.0, 'height': 224, 'width': 224},
    }
    resize = {
        'PadIfNeeded': {'p': 1.0, 'min_height': 512, 'min_width': 512},
        'Resize': {'p': 1.0, 'height': 224, 'width': 224, 'interpolation': 2},
    }

    sl = slice(0, None, 5)
    train_data = Food101Dataset(augmentation=augmentation, drop_index=sl)
    valid_data = Food101Dataset(augmentation=resize, index=sl)

    train_iter = iterators.SerialIterator(train_data, args.batchsize)
    valid_iter = iterators.SerialIterator(valid_data, args.batchsize,
                                          repeat=False, shuffle=False)

    updater = StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = Trainer(updater, (args.epoch, 'epoch'), out=save_dir)

    report_trigger = (int(args.report_trigger[:-1]), 'iteration'
                      if args.report_trigger[-1] == 'i' else 'epoch')
    trainer.extend(extensions.LogReport(trigger=report_trigger))
    trainer.extend(extensions.Evaluator(valid_iter, model, device=args.gpu),
                   name='val', trigger=report_trigger)
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'main/loss', 'main/accuracy', 'val/main/loss',
         'val/main/accuracy', 'elapsed_time']), trigger=report_trigger)
    trainer.extend(
        extensions.PlotReport(['main/loss', 'val/main/loss'],
                              x_key=report_trigger[1], marker='.',
                              file_name='loss.png', trigger=report_trigger))
    trainer.extend(
        extensions.PlotReport(['main/accuracy', 'val/main/accuracy'],
                              x_key=report_trigger[1], marker='.',
                              file_name='accuracy.png',
                              trigger=report_trigger))

    save_trigger = (int(args.save_trigger[:-1]), 'iteration'
                    if args.save_trigger[-1] == 'i' else 'epoch')
    trainer.extend(extensions.snapshot_object(
        model, filename='model_{0}-{{.updater.{0}}}.npz'.format(save_trigger[1])),
        trigger=save_trigger)
    trainer.extend(extensions.snapshot_object(
        optimizer,
        filename='optimizer_{0}-{{.updater.{0}}}.npz'.format(save_trigger[1])),
        trigger=save_trigger)
    trainer.extend(extensions.ProgressBar())

    if save_dir.exists():
        shutil.rmtree(save_dir)
    save_dir.mkdir()

    # Write parameters text
    with open(save_dir / 'train_params.txt', 'w') as f:
        f.write('model: {}\n'.format(model.predictor.__class__.__name__))
        f.write('n_epoch: {}\n'.format(args.epoch))
        f.write('batch_size: {}\n'.format(args.batchsize))
        f.write('seed: {}\n'.format(args.seed))
        f.write('n_data_train: {}\n'.format(len(train_data)))
        f.write('n_data_val: {}\n'.format(len(valid_data)))
        f.write('augmentation: \n')
        for k, v in augmentation.items():
            f.write(' {}: {}\n'.format(k, v))

    trainer.run()
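
# Hedged sketch (assumption): the augmentation dicts above use albumentations
# transform names, so a dataset could turn them into a pipeline roughly like
# this. The real Food101Dataset implementation is not shown here.
import albumentations as A


def build_transform(config):
    # config maps transform class names to their keyword arguments
    return A.Compose([getattr(A, name)(**params)
                      for name, params in config.items()])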
def main(arg_list=None):
    parser = argparse.ArgumentParser(description='Chainer LSTM')
    parser.add_argument('--epoch', '-e', type=int, nargs='+', default=[20],
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--optimizer', '-o', nargs='+',
                        default=['momentumsgd'],
                        help='Optimizer (sgd, momentumsgd, adam)')
    parser.add_argument('--batchsize', '-b', type=int, nargs='+',
                        default=[128],
                        help='Number of training points in each mini-batch')
    parser.add_argument('--lr', type=float, nargs='+',
                        default=[1e-2, 1e-3, 1e-4, 1e-5],
                        help='Learning rate')
    parser.add_argument('--network', '-n', default='ff',
                        help='Neural network type, either "ff", "tdnn", '
                             '"lstm", "zoneoutlstm", "peepholelstm" or "gru". '
                             'Setting any recurrent network implies '
                             '"--shuffle-sequences"')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--units', '-u', type=int, nargs='+', default=[1024],
                        help='Number of units')
    parser.add_argument('--layers', '-l', type=int, default=2,
                        help='Number of hidden layers')
    parser.add_argument('--activation', '-a', default='relu',
                        help='FF activation function (sigmoid, tanh or relu)')
    parser.add_argument('--tdnn-ksize', type=int, nargs='+', default=[5],
                        help='TDNN kernel size')
    parser.add_argument('--bproplen', type=int, default=20,
                        help='Backpropagation length')
    parser.add_argument('--timedelay', type=int, default=0,
                        help='Delay target values by this many time steps')
    parser.add_argument('--noplot', dest='plot', action='store_false',
                        help='Disable PlotReport extension')
    parser.add_argument('--splice', type=int, default=0,
                        help='Splicing size')
    parser.add_argument('--dropout', '-d', type=float, nargs='+', default=[0],
                        help='Dropout rate (0 to disable). In case of Zoneout '
                             'LSTM, this parameter has 2 arguments: '
                             'c_ratio h_ratio')
    parser.add_argument('--ft', default='final.feature_transform',
                        help='Kaldi feature transform file')
    parser.add_argument('--tri', action='store_true',
                        help='Use triphones')
    parser.add_argument('--shuffle-sequences', action='store_true',
                        help='True if sequences should be shuffled as a whole, '
                             'otherwise all frames will be shuffled '
                             'independently of each other')
    parser.add_argument('--data-dir', default='data/fmllr',
                        help='Data directory, this will be prepended to data '
                             'files and feature transform')
    parser.add_argument('--offset-dir', default='data',
                        help='Data directory, this will be prepended to '
                             'offset files')
    parser.add_argument('--target-dir', default='data/targets',
                        help='Data directory, this will be prepended to '
                             'target files')
    parser.add_argument('--ivector-dir',
                        help='Data directory, this will be prepended to '
                             'ivector files')
    parser.add_argument('--data', default='data_{}.npy',
                        help='Training data')
    parser.add_argument('--offsets', default='offsets_{}.npy',
                        help='Training offsets')
    parser.add_argument('--targets', default='targets_{}.npy',
                        help='Training targets')
    parser.add_argument('--ivectors', default='ivectors_{}.npy',
                        help='Training ivectors')
    parser.add_argument('--no-validation', dest='use_validation',
                        action='store_false',
                        help='Do not evaluate validation data while training')
    parser.add_argument('--train-fold', type=int,
                        help='Train fold network with this ID')
    parser.add_argument('--train-rpl', action='store_true',
                        help='Train RPL layer')
    parser.add_argument('--rpl-model', default="result_rpl/model",
                        help='RPL layer model')
    parser.add_argument('--fold-data-dir', default="fold_data",
                        help='Directory with fold input data')
    parser.add_argument('--fold-output-dir', default="fold_data_out",
                        help='Directory with predicted fold output')
    parser.add_argument('--fold-model-dir', default="fold_models",
                        help='Directory with output fold model')
    parser.add_argument('--fold-data-pattern', default='data_{0}.npy',
                        help='Filename pattern of each fold data, {0} will be '
                             'replaced by fold ID')
    parser.add_argument('--fold-offset-pattern', default='offsets_{0}.npy',
                        help='Filename pattern of each fold offset')
    parser.add_argument('--fold-target-pattern', default='targets_{0}.npy',
                        help='Filename pattern of each fold targets')
    parser.add_argument('--fold-ivector-pattern', default='ivectors_{}.npy',
                        help='Filename pattern of each fold i-vectors file, '
                             '{} will be replaced by fold ID')
    parser.add_argument('--fold-output-pattern', default='data_{0}.npy',
                        help='Filename pattern of each fold network output')
    parser.add_argument('--fold-network-pattern', default='fold_{0}.npz',
                        help='Filename pattern of each fold network')
    parser.add_argument('--no-progress', action='store_true',
                        help='Disable progress bar')

    if arg_list is not None:
        args = parser.parse_args(list(map(str, arg_list)))
    else:
        args = parser.parse_args()

    # set options implied by other options
    if is_nn_recurrent(args.network):
        args.shuffle_sequences = True

    # create output directories
    Path(args.out).mkdir(exist_ok=True, parents=True)
    if args.train_fold is not None:
        file_out = Path(args.fold_model_dir,
                        args.fold_network_pattern.format(args.train_fold))
        Path(file_out.parent).mkdir(exist_ok=True, parents=True)

    # print arguments to the file
    with open(args.out + "/args.txt", "w") as f:
        for attr in dir(args):
            if not attr.startswith('_'):
                f.write('# {}: {}\n'.format(attr, getattr(args, attr)))
        f.write(' '.join(
            map(lambda x: "'" + x + "'" if ' ' in x else x, sys.argv)) + '\n')

    # print arguments to stdout
    for attr in dir(args):
        if not attr.startswith('_'):
            print('# {}: {}'.format(attr, getattr(args, attr)))
    print('')

    # number of output classes (triphone vs. monophone targets)
    num_classes = 1909 if args.tri else 39

    # create model
    if args.train_rpl:
        model = RPL4(num_classes)
        model_cls = L.Classifier(model)
    else:
        if args.activation == "sigmoid":
            activation = F.sigmoid
        elif args.activation == "tanh":
            activation = F.tanh
        elif args.activation == "relu":
            activation = F.relu
        else:
            print("Wrong activation function specified")
            return
        model = get_nn(args.network, args.layers, args.units, num_classes,
                       activation, args.tdnn_ksize, args.dropout)
        # classifier reports softmax cross entropy loss and accuracy at every
        # iteration, which will be used by the PrintReport extension below.
        model_cls = L.Classifier(model)

    if args.gpu >= 0:
        # make a specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model_cls.to_gpu()  # copy the model to the GPU

    offsets = offsets_dev = None
    if args.train_rpl:
        # load training data
        fold = 0
        x = []
        y = []
        while True:
            x_file = Path(args.fold_output_dir,
                          args.fold_output_pattern.format(fold))
            y_file = Path(args.fold_data_dir,
                          args.fold_target_pattern.format(fold))
            if not x_file.is_file() or not y_file.is_file():
                break
            print("Loading fold {} data".format(fold))
            x_ = np.load(str(x_file))
            y_ = np.load(str(y_file))
            x.append(x_)
            y.append(y_)
            fold += 1
        if fold == 0:
            print("Error: No fold data found")
            return
        x = np.concatenate(x, axis=0)
        y = np.concatenate(y, axis=0)

        if args.use_validation:
            # TODO: use args.data instead of args.dev_data
            x_dev = np.load(str(Path(args.data_dir, args.data.format("dev"))))
            # offsets_dev = loadBin(str(Path(args.datadir, args.dev_offsets)), np.int32)
            y_dev = np.load(
                str(Path(args.target_dir, args.targets.format("dev"))))
    else:
        # load training data
        ivectors = None
        ivectors_dev = None
        if args.train_fold is not None:
            x = []
            offsets = [0]
            y = []
            ivectors = []
            num = 0
            fold = 0
            while True:
                if fold != args.train_fold:
                    x_file = Path(args.fold_data_dir,
                                  args.fold_data_pattern.format(fold))
                    if not x_file.is_file():
                        break
                    offsets_file = Path(args.fold_data_dir,
                                        args.fold_offset_pattern.format(fold))
                    y_file = Path(args.fold_data_dir,
                                  args.fold_target_pattern.format(fold))
                    if args.ivector_dir is not None:
                        ivectors_file = Path(
                            args.fold_data_dir,
                            args.fold_ivector_pattern.format(fold))
                        if not ivectors_file.is_file():
                            print("Error: missing ivectors for fold data {}".
                                  format(fold))
                            return
                    print("Loading fold {} data".format(fold))
                    x_fold = np.load(str(x_file))
                    x.append(x_fold)
                    if is_nn_recurrent(args.network):
                        offsets_fold = np.load(str(offsets_file))
                        offsets.extend(offsets_fold[1:] + num)
                    y_fold = np.load(str(y_file))
                    y.append(y_fold)
                    if args.ivector_dir is not None:
                        ivectors_fold = np.load(str(ivectors_file))
                        ivectors.append(ivectors_fold)
                    num += x_fold.shape[0]
                fold += 1
            if len(x) == 0:
                print("Error: No fold data found")
                return
            x = np.concatenate(x, axis=0)
            if is_nn_recurrent(args.network):
                offsets = np.array(offsets, dtype=np.int32)
            y = np.concatenate(y, axis=0)
            if args.ivector_dir is not None:
                ivectors = np.concatenate(ivectors, axis=0)
        else:
            x = np.load(str(Path(args.data_dir, args.data.format("train"))))
            if is_nn_recurrent(args.network):
                offsets = np.load(
                    str(Path(args.offset_dir, args.offsets.format("train"))))
            y = np.load(
                str(Path(args.target_dir, args.targets.format("train"))))
            if args.ivector_dir is not None:
                ivectors = np.load(
                    str(Path(args.ivector_dir, args.ivectors.format("train"))))

        if args.use_validation:
            x_dev = np.load(str(Path(args.data_dir, args.data.format("dev"))))
            if is_nn_recurrent(args.network):
                offsets_dev = np.load(
                    str(Path(args.offset_dir, args.offsets.format("dev"))))
            y_dev = np.load(
                str(Path(args.target_dir, args.targets.format("dev"))))
            if args.ivector_dir is not None:
                ivectors_dev = np.load(
                    str(Path(args.ivector_dir, args.ivectors.format("dev"))))

        # apply splicing
        if args.network == "tdnn":
            splice = (sum(args.tdnn_ksize) - len(args.tdnn_ksize)) // 2
        else:
            splice = args.splice
        if splice > 0:
            x = splicing(x, range(-splice, splice + 1))
            x_dev = splicing(x_dev, range(-splice, splice + 1))

        # load feature transform
        if args.ft and args.ft != '-':
            ft = loadKaldiFeatureTransform(str(Path(args.data_dir, args.ft)))
            if is_nn_recurrent(args.network):
                # select transform middle frame if the network is recurrent
                dim = ft["shape"][1]
                zi = ft["shifts"].index(0)
                ft["rescale"] = ft["rescale"][zi * dim:(zi + 1) * dim]
                ft["addShift"] = ft["addShift"][zi * dim:(zi + 1) * dim]
                ft["shape"][0] = dim
                ft["shifts"] = [0]
            elif args.network == "tdnn":
                dim = ft["shape"][1]
                zi = ft["shifts"].index(0)
                winlen = 2 * splice + 1
                ft["rescale"] = np.tile(ft["rescale"][zi * dim:(zi + 1) * dim],
                                        winlen)
                ft["addShift"] = np.tile(
                    ft["addShift"][zi * dim:(zi + 1) * dim], winlen)
                ft["shape"][0] = dim * winlen
                ft["shifts"] = list(range(-splice, splice + 1))
            # apply feature transform
            x = applyKaldiFeatureTransform(x, ft)
            if args.use_validation:
                x_dev = applyKaldiFeatureTransform(x_dev, ft)

        if ivectors is not None:
            x = np.concatenate((x, ivectors), axis=1)
        if ivectors_dev is not None:
            x_dev = np.concatenate((x_dev, ivectors_dev), axis=1)

        # shift the input dataset according to time delay
        if is_nn_recurrent(args.network) and args.timedelay != 0:
            x, y, offsets = apply_time_delay(x, y, offsets, args.timedelay)
            if args.use_validation:
                x_dev, y_dev, offsets_dev = apply_time_delay(
                    x_dev, y_dev, offsets_dev, args.timedelay)

    # create chainer datasets
    train_dataset = chainer.datasets.TupleDataset(x, y)
    if args.use_validation:
        dev_dataset = chainer.datasets.TupleDataset(x_dev, y_dev)

    # prepare train stages
    train_stages_len = max(len(args.batchsize), len(args.lr))
    train_stages = [{
        'epoch': index_padded(args.epoch, i),
        'opt': index_padded(args.optimizer, i),
        'bs': index_padded(args.batchsize, i),
        'lr': index_padded(args.lr, i)
    } for i in range(train_stages_len)]

    for i, ts in enumerate(train_stages):
        if ts['opt'] == 'adam':
            # learning rate not used, don't print it
            print("=== Training stage {}: epoch = {}, batchsize = {}, "
                  "optimizer = {}".format(i, ts['epoch'], ts['bs'], ts['opt']))
        else:
            print("=== Training stage {}: epoch = {}, batchsize = {}, "
                  "optimizer = {}, learning rate = {}".format(
                      i, ts['epoch'], ts['bs'], ts['opt'], ts['lr']))

        # reset state to allow training with different batch size in each stage
        if not args.train_rpl and is_nn_recurrent(args.network):
            model.reset_state()

        # setup an optimizer
        if ts['opt'] == "sgd":
            optimizer = chainer.optimizers.SGD(lr=ts['lr'])
        elif ts['opt'] == "momentumsgd":
            optimizer = chainer.optimizers.MomentumSGD(lr=ts['lr'])
        elif ts['opt'] == "adam":
            optimizer = chainer.optimizers.Adam()
        else:
            print("Wrong optimizer specified: {}".format(ts['opt']))
            exit(1)
        optimizer.setup(model_cls)

        if args.shuffle_sequences:
            train_iter = SequenceShuffleIterator(train_dataset, offsets,
                                                 ts['bs'])
            if args.use_validation:
                dev_iter = SequenceShuffleIterator(dev_dataset, None, ts['bs'],
                                                   repeat=False, shuffle=False)
        else:
            train_iter = SerialIterator(train_dataset, ts['bs'])
            if args.use_validation:
                dev_iter = SerialIterator(dev_dataset, ts['bs'], repeat=False,
                                          shuffle=False)

        # set up a trainer
        if is_nn_recurrent(args.network):
            updater = BPTTUpdater(train_iter, optimizer, args.bproplen,
                                  device=args.gpu)
        else:
            updater = StandardUpdater(train_iter, optimizer, device=args.gpu)
        if args.use_validation:
            stop_trigger = EarlyStoppingTrigger(ts['epoch'],
                                                key='validation/main/loss',
                                                eps=-0.001)
        else:
            stop_trigger = (ts['epoch'], 'epoch')
        trainer = training.Trainer(updater, stop_trigger,
                                   out="{}/{}".format(args.out, i))

        trainer.extend(model_saver)

        # evaluate the model with the development dataset for each epoch
        if args.use_validation:
            trainer.extend(
                extensions.Evaluator(dev_iter, model_cls, device=args.gpu))

        # dump a computational graph from 'loss' variable at the first iteration
        # the "main" refers to the target link of the "main" optimizer.
        trainer.extend(extensions.dump_graph('main/loss'))

        # take a snapshot for each specified epoch
        frequency = ts['epoch'] if args.frequency == -1 else max(
            1, args.frequency)
        trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))

        # write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())

        # save two plot images to the result dir
        if args.plot and extensions.PlotReport.available():
            plot_vars_loss = ['main/loss']
            plot_vars_acc = ['main/accuracy']
            if args.use_validation:
                plot_vars_loss.append('validation/main/loss')
                plot_vars_acc.append('validation/main/accuracy')
            trainer.extend(
                extensions.PlotReport(plot_vars_loss, 'epoch',
                                      file_name='loss.png'))
            trainer.extend(
                extensions.PlotReport(plot_vars_acc, 'epoch',
                                      file_name='accuracy.png'))

        # print selected entries of the log to stdout
        # here "main" refers to the target link of the "main" optimizer again, and
        # "validation" refers to the default name of the Evaluator extension.
        # entries other than 'epoch' are reported by the Classifier link, called by
        # either the updater or the evaluator.
        if args.use_validation:
            print_report_vars = [
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]
        else:
            print_report_vars = [
                'epoch', 'main/loss', 'main/accuracy', 'elapsed_time'
            ]
        trainer.extend(extensions.PrintReport(print_report_vars))

        # print a progress bar to stdout
        # trainer.extend(extensions.ProgressBar())

        if args.resume:
            # Resume from a snapshot
            chainer.serializers.load_npz(args.resume, trainer)

        # Run the training
        trainer.run()

        # load the last model if the max epoch was not reached (that means the
        # early stopping trigger stopped training because the validation loss
        # increased)
        if updater.epoch_detail < ts['epoch']:
            chainer.serializers.load_npz(
                "{}/{}/model_tmp".format(args.out, i), model_cls)
        # remove temporary model from this training stage
        os.remove("{}/{}/model_tmp".format(args.out, i))

    # save the final model
    chainer.serializers.save_npz("{}/model".format(args.out), model_cls)
    if args.train_fold is not None:
        chainer.serializers.save_npz(
            str(Path(args.fold_model_dir,
                     args.fold_network_pattern.format(args.train_fold))),
            model_cls)
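
# Hedged helper sketch (assumption): `index_padded` is not shown above; from
# its use in building the train stages it plausibly returns the i-th element
# of a list, falling back to the last element when the list is shorter.
def index_padded(values, i):
    return values[i] if i < len(values) else values[-1]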
def main():
    parser = argparse.ArgumentParser(description='training mnist')
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--epoch', '-e', type=int, default=300,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--seed', '-s', type=int, default=0,
                        help='Random seed')
    parser.add_argument('--n_fold', '-nf', type=int, default=5,
                        help='n_fold cross validation')
    parser.add_argument('--fold', '-f', type=int, default=1)
    parser.add_argument('--out_dir_name', '-dn', type=str, default=None,
                        help='Name of the output directory')
    parser.add_argument('--report_trigger', '-rt', type=str, default='1e',
                        help='Interval for reporting (Ex.100i, default:1e)')
    parser.add_argument('--save_trigger', '-st', type=str, default='1e',
                        help='Interval for saving the model '
                             '(Ex.100i, default:1e)')
    parser.add_argument('--load_model', '-lm', type=str, default=None,
                        help='Path of the model object to load')
    parser.add_argument('--load_optimizer', '-lo', type=str, default=None,
                        help='Path of the optimizer object to load')
    args = parser.parse_args()

    if args.out_dir_name is None:
        start_time = datetime.now()
        out_dir = Path('output/{}'.format(start_time.strftime('%Y%m%d_%H%M')))
    else:
        out_dir = Path('output/{}'.format(args.out_dir_name))

    random.seed(args.seed)
    np.random.seed(args.seed)
    cupy.random.seed(args.seed)
    chainer.config.cudnn_deterministic = True

    # model = ModifiedClassifier(SEResNeXt50())
    # model = ModifiedClassifier(SERes2Net50())
    model = ModifiedClassifier(SEResNeXt101())
    if args.load_model is not None:
        serializers.load_npz(args.load_model, model)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    optimizer = optimizers.MomentumSGD(lr=0.1, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(1e-4))
    if args.load_optimizer is not None:
        serializers.load_npz(args.load_optimizer, optimizer)

    n_fold = args.n_fold
    slices = [slice(i, None, n_fold) for i in range(n_fold)]
    fold = args.fold - 1

    # model1
    # augmentation = [
    #     ('Rotate', {'p': 0.8, 'limit': 5}),
    #     ('PadIfNeeded', {'p': 0.5, 'min_height': 28, 'min_width': 30}),
    #     ('PadIfNeeded', {'p': 0.5, 'min_height': 30, 'min_width': 28}),
    #     ('Resize', {'p': 1.0, 'height': 28, 'width': 28}),
    #     ('RandomScale', {'p': 1.0, 'scale_limit': 0.1}),
    #     ('PadIfNeeded', {'p': 1.0, 'min_height': 32, 'min_width': 32}),
    #     ('RandomCrop', {'p': 1.0, 'height': 28, 'width': 28}),
    #     ('Mixup', {'p': 0.5}),
    #     ('Cutout', {'p': 0.5, 'num_holes': 4, 'max_h_size': 4,
    #                 'max_w_size': 4}),
    # ]
    # resize = None

    # model2
    augmentation = [
        ('Rotate', {'p': 0.8, 'limit': 5}),
        ('PadIfNeeded', {'p': 0.5, 'min_height': 28, 'min_width': 32}),
        ('PadIfNeeded', {'p': 0.5, 'min_height': 32, 'min_width': 28}),
        ('Resize', {'p': 1.0, 'height': 32, 'width': 32}),
        ('RandomScale', {'p': 1.0, 'scale_limit': 0.1}),
        ('PadIfNeeded', {'p': 1.0, 'min_height': 36, 'min_width': 36}),
        ('RandomCrop', {'p': 1.0, 'height': 32, 'width': 32}),
        ('Mixup', {'p': 0.5}),
        ('Cutout', {'p': 0.5, 'num_holes': 4, 'max_h_size': 4,
                    'max_w_size': 4}),
    ]
    resize = [('Resize', {'p': 1.0, 'height': 32, 'width': 32})]

    train_data = KMNIST(augmentation=augmentation, drop_index=slices[fold],
                        pseudo_labeling=True)
    valid_data = KMNIST(augmentation=resize, index=slices[fold])

    train_iter = iterators.SerialIterator(train_data, args.batchsize)
    valid_iter = iterators.SerialIterator(valid_data, args.batchsize,
                                          repeat=False, shuffle=False)

    updater = StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = Trainer(updater, (args.epoch, 'epoch'), out=out_dir)

    report_trigger = (int(args.report_trigger[:-1]), 'iteration'
                      if args.report_trigger[-1] == 'i' else 'epoch')
    trainer.extend(extensions.LogReport(trigger=report_trigger))
    trainer.extend(extensions.Evaluator(valid_iter, model, device=args.gpu),
                   name='val', trigger=report_trigger)
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'main/loss', 'main/accuracy', 'val/main/loss',
         'val/main/accuracy', 'elapsed_time']), trigger=report_trigger)
    trainer.extend(
        extensions.PlotReport(['main/loss', 'val/main/loss'],
                              x_key=report_trigger[1], marker='.',
                              file_name='loss.png', trigger=report_trigger))
    trainer.extend(
        extensions.PlotReport(['main/accuracy', 'val/main/accuracy'],
                              x_key=report_trigger[1], marker='.',
                              file_name='accuracy.png',
                              trigger=report_trigger))

    save_trigger = (int(args.save_trigger[:-1]), 'iteration'
                    if args.save_trigger[-1] == 'i' else 'epoch')
    trainer.extend(extensions.snapshot_object(
        model, filename='model_{0}-{{.updater.{0}}}.npz'.format(save_trigger[1])),
        trigger=save_trigger)
    trainer.extend(extensions.snapshot_object(
        optimizer,
        filename='optimizer_{0}-{{.updater.{0}}}.npz'.format(save_trigger[1])),
        trigger=save_trigger)
    trainer.extend(extensions.ProgressBar())
    trainer.extend(CosineAnnealing(lr_max=0.1, lr_min=1e-6, T_0=20),
                   trigger=(1, 'epoch'))

    best_model_trigger = triggers.MaxValueTrigger('val/main/accuracy',
                                                  trigger=(1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model,
                                              filename='best_model.npz'),
                   trigger=best_model_trigger)
    trainer.extend(extensions.snapshot_object(optimizer,
                                              filename='best_optimizer.npz'),
                   trigger=best_model_trigger)
    best_loss_model_trigger = triggers.MinValueTrigger('val/main/loss',
                                                       trigger=(1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model,
                                              filename='best_loss_model.npz'),
                   trigger=best_loss_model_trigger)
    trainer.extend(extensions.snapshot_object(
        optimizer, filename='best_loss_optimizer.npz'),
        trigger=best_loss_model_trigger)

    if out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir()

    # Write parameters text
    with open(out_dir / 'train_params.txt', 'w') as f:
        f.write('model: {}\n'.format(model.predictor.__class__.__name__))
        f.write('n_epoch: {}\n'.format(args.epoch))
        f.write('batch_size: {}\n'.format(args.batchsize))
        f.write('n_data_train: {}\n'.format(len(train_data)))
        f.write('n_data_val: {}\n'.format(len(valid_data)))
        f.write('seed: {}\n'.format(args.seed))
        f.write('n_fold: {}\n'.format(args.n_fold))
        f.write('fold: {}\n'.format(args.fold))
        f.write('augmentation: \n')
        for process, param in augmentation:
            f.write(' {}: {}\n'.format(process, param))

    trainer.run()
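
# Hedged sketch (assumption): `CosineAnnealing` above is a custom extension
# that is not shown. A minimal cosine schedule for MomentumSGD's `lr`,
# restarting every `T_0` epochs, could look roughly like this.
import numpy as np
from chainer.training import Extension


class CosineAnnealing(Extension):
    def __init__(self, lr_max, lr_min=0.0, T_0=20):
        self.lr_max, self.lr_min, self.T_0 = lr_max, lr_min, T_0

    def __call__(self, trainer):
        # Anneal from lr_max down to lr_min within each T_0-epoch cycle.
        t = (trainer.updater.epoch % self.T_0) / self.T_0
        lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
            1 + np.cos(np.pi * t))
        trainer.updater.get_optimizer('main').lr = lr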
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--batchsize', type=int, default=2)
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--mini', action="store_true")
    parser.add_argument('--input_size', type=int, default=512)
    args = parser.parse_args()

    dtype = np.float32
    num_class = len(voc_bbox_label_names)

    data_augmentation_transform = DataAugmentationTransform(args.input_size)
    center_detection_transform = CenterDetectionTransform(
        args.input_size, num_class, 4, dtype=dtype)

    train = TransformDataset(
        ConcatenatedDataset(VOCBboxDataset(year='2007', split='trainval'),
                            VOCBboxDataset(year='2012', split='trainval')),
        data_augmentation_transform)
    train = TransformDataset(train, center_detection_transform)
    if args.mini:
        train = datasets.SubDataset(train, 0, 100)
    train_iter = chainer.iterators.MultiprocessIterator(train, args.batchsize)

    test = VOCBboxDataset(year='2007', split='test', use_difficult=True,
                          return_difficult=True)
    if args.mini:
        test = datasets.SubDataset(test, 0, 20)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    detector = CenterDetector(HourglassNet, args.input_size, num_class,
                              dtype=dtype)
    # detector = CenterDetector(SimpleCNN, args.input_size, num_class)
    train_chain = CenterDetectorTrain(detector, 1, 0.1, 1)
    # train_chain = CenterDetectorTrain(detector, 1, 0, 0)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        train_chain.to_gpu(args.gpu)

    optimizer = Adam(alpha=1.25e-4)
    # optimizer = SGD()
    optimizer.setup(train_chain)

    updater = StandardUpdater(train_iter, optimizer, device=args.gpu)

    log_interval = 1, 'epoch'
    log_interval_mini = 500, 'iteration'
    trainer = Trainer(updater, (args.epoch, 'epoch'), out=f"result{args.gpu}")
    trainer.extend(extensions.LogReport(trigger=log_interval_mini))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'lr',
        'main/loss', 'main/hm_loss', 'main/wh_loss', 'main/offset_loss',
        'main/hm_mae', 'main/hm_pos_loss', 'main/hm_neg_loss',
        'validation/main/map',
    ]), trigger=log_interval_mini)
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(DetectionVOCEvaluator(test_iter, detector,
                                         use_07_metric=True,
                                         label_names=voc_bbox_label_names),
                   trigger=(1, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        detector, 'detector{.updater.epoch:03}.npz'),
        trigger=(1, 'epoch'))

    trainer.run()
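
# Hedged inference sketch (assumption): once a snapshot has been written, the
# detector can presumably be reloaded and run on an image, assuming it follows
# ChainerCV's detector interface; file names below are illustrative.
# import chainer
# from chainercv.utils import read_image
#
# chainer.serializers.load_npz('result0/detector010.npz', detector)
# img = read_image('sample.jpg')
# bboxes, labels, scores = detector.predict([img])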
def main(args):
    args = prepare_log_dir(args)

    # set dtype for training
    chainer.global_config.dtype = args.dtype

    train_dataset = BaseImageDataset(
        args.train_file,
        args.image_size,
        root=os.path.dirname(args.train_file),
        dtype=chainer.get_dtype(),
    )
    validation_dataset = BaseImageDataset(
        args.val_file,
        args.image_size,
        root=os.path.dirname(args.val_file),
        dtype=chainer.get_dtype(),
    )

    train_iter = MultiprocessIterator(train_dataset,
                                      batch_size=args.batch_size,
                                      shuffle=True)
    validation_iter = MultiprocessIterator(validation_dataset,
                                           batch_size=args.batch_size,
                                           repeat=False)

    net = HandwritingNet()
    model = L.Classifier(net, label_key='has_text')

    tensorboard_handle = SummaryWriter(log_dir=args.log_dir)

    optimizer = Adam(alpha=args.learning_rate)
    optimizer.setup(model)
    if args.save_gradient_information:
        optimizer.add_hook(
            TensorboardGradientPlotter(tensorboard_handle, args.log_interval),
        )

    # log train information every time we encounter a new epoch or
    # args.log_interval iterations have been done
    log_interval_trigger = (
        lambda trainer:
        (trainer.updater.is_new_epoch
         or trainer.updater.iteration % args.log_interval == 0)
        and trainer.updater.iteration > 0
    )

    updater = StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = Trainer(updater, (args.num_epoch, 'epoch'), out=args.log_dir)

    data_to_log = {
        'log_dir': args.log_dir,
        'image_size': args.image_size,
        # 'num_layers': args.num_layers,
        'keep_aspect_ratio': train_dataset.keep_aspect_ratio,
        'net': get_import_info(net),
    }
    for argument in filter(lambda x: not x.startswith('_'), dir(args)):
        data_to_log[argument] = getattr(args, argument)

    def backup_train_config(stats_cpu):
        iteration = stats_cpu.pop('iteration')
        epoch = stats_cpu.pop('epoch')
        elapsed_time = stats_cpu.pop('elapsed_time')

        for key, value in stats_cpu.items():
            tensorboard_handle.add_scalar(key, value, iteration)

        if iteration == args.log_interval:
            stats_cpu.update(data_to_log)

        stats_cpu.update({
            "epoch": epoch,
            "iteration": iteration,
            "elapsed_time": elapsed_time,
        })

    trainer.extend(
        extensions.snapshot_object(net, net.__class__.__name__ +
                                   '_{.updater.iteration}.npz'),
        trigger=lambda trainer: trainer.updater.is_new_epoch or
        trainer.updater.iteration % args.snapshot_interval == 0,
    )
    trainer.extend(
        extensions.snapshot(filename='trainer_snapshot',
                            autoload=args.resume is not None),
        trigger=(args.snapshot_interval, 'iteration')
    )
    trainer.extend(
        TensorboardEvaluator(
            validation_iter,
            model,
            device=args.gpu,
            tensorboard_handle=tensorboard_handle
        ),
        trigger=(args.test_interval, 'iteration'),
    )

    logger = Logger(
        os.path.dirname(os.path.realpath(__file__)),
        args.log_dir,
        postprocess=backup_train_config,
        trigger=log_interval_trigger,
        exclusion_filters=['*logs*', '*.pyc', '__pycache__', '.git*'],
        resume=args.resume is not None,
    )
    trainer.extend(logger, trigger=log_interval_trigger)
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'iteration', 'main/loss', 'main/accuracy',
             'validation/main/accuracy'],
            log_report='Logger',
        ),
        trigger=log_interval_trigger,
    )

    trainer.extend(extensions.ExponentialShift('alpha', 0.1,
                                               optimizer=optimizer),
                   trigger=(10, 'epoch'))
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.run()
def main(args):
    random.seed(0)
    np.random.seed(0)
    if args.gpu >= 0:
        cuda.get_device_from_id(args.gpu).use()
        cuda.cupy.random.seed(0)

    dataset, id2ene = load_dataset(args.dataset, args.features, args.redirects)
    print(f'# of examples in dataset: {len(dataset)}')

    def batch2tensors(batch, device):
        xp = cuda.cupy if device >= 0 else np
        xf = xp.zeros((len(batch), args.n_feature), dtype='f')
        xe = xp.zeros((len(batch), args.embed_size), dtype='f')
        t = xp.zeros((len(batch), len(id2ene)), dtype='i')

        for i, item in enumerate(batch):
            for feature_id in item['feature_ids']:
                if feature_id < args.n_feature:
                    xf[i, feature_id] = 1.0
            if item['embedding']:
                xe[i] = xp.array(item['embedding'], dtype='f')
            for ene_id in item['ene_ids']:
                t[i, ene_id] = 1

        x = xp.concatenate((xf, xe), axis=1)
        return x, t

    cv_datasets = get_cross_validation_datasets(dataset, args.cv)

    ys = []
    ts = []
    for split_idx, cv_dataset in enumerate(cv_datasets):
        print(f'cross validation ({split_idx + 1}/{len(cv_datasets)})')
        train, test = cv_dataset
        train_iter = SerialIterator(train, batch_size=args.batch)
        test_iter = SerialIterator(test, batch_size=args.batch, repeat=False, shuffle=False)

        model = ENEClassifier(in_size=args.n_feature + args.embed_size,
                              hidden_size=args.hidden_size,
                              out_size=len(id2ene))
        if args.gpu >= 0:
            model.to_gpu(args.gpu)

        optimizer = optimizers.Adam()
        optimizer.setup(model)

        updater = StandardUpdater(train_iter, optimizer,
                                  converter=batch2tensors, device=args.gpu)
        trainer = Trainer(updater, (args.epoch, 'epoch'), out=args.out_dir)
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.snapshot_object(model, filename='epoch_{.updater.epoch}.model'))
        trainer.extend(
            extensions.Evaluator(test_iter, model,
                                 converter=batch2tensors, device=args.gpu))
        trainer.extend(
            extensions.PrintReport(
                ['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar(update_interval=1))
        trainer.run()

        # collect predictions for this fold's test set
        test_iter.reset()
        for batch in test_iter:
            x, t = batch2tensors(batch, device=args.gpu)
            with chainer.using_config('train', False):
                y = model.predict(x)
            ys.append(y)
            ts.append(t)

    y_all = F.concat(ys, axis=0)
    t_all = F.concat(ts, axis=0)

    prediction_matrix = (y_all.data >= 0.5).astype('f')
    reference_matrix = (t_all.data == 1).astype('f')
    accuracy_matrix = prediction_matrix * reference_matrix

    eb_pred = prediction_matrix.sum(axis=1)  # entity-based num. of predicted classes
    eb_ref = reference_matrix.sum(axis=1)    # entity-based num. of reference classes
    eb_acc = accuracy_matrix.sum(axis=1)     # entity-based num. of accurate classes
    eb_nopred = (eb_pred == 0.).astype('f')  # for avoiding zero division

    eb_precision = (eb_acc / (eb_pred + eb_nopred)).mean()
    eb_recall = (eb_acc / eb_ref).mean()
    eb_f1 = (2 * eb_acc / (eb_pred + eb_ref)).mean()

    cb_pred = prediction_matrix.sum(axis=0)  # class-based num. of predicted examples
    cb_ref = reference_matrix.sum(axis=0)    # class-based num. of reference examples
    cb_acc = accuracy_matrix.sum(axis=0)     # class-based num. of accurate examples
    cb_nopred = (cb_pred == 0.).astype('f')  # for avoiding zero division

    cb_macro_precision = (cb_acc / (cb_pred + cb_nopred)).mean()
    cb_macro_recall = (cb_acc / cb_ref).mean()
    cb_macro_f1 = (2 * cb_acc / (cb_pred + cb_ref)).mean()

    cb_micro_precision = cb_acc.sum() / cb_pred.sum()
    cb_micro_recall = cb_acc.sum() / cb_ref.sum()
    cb_micro_f1 = (2 * cb_acc.sum()) / (cb_pred.sum() + cb_ref.sum())

    print(f'Entity-based Precision:      {float(eb_precision):.2%}')
    print(f'Entity-based Recall:         {float(eb_recall):.2%}')
    print(f'Entity-based F1 score:       {float(eb_f1):.2%}')
    print(f'Class-based macro Precision: {float(cb_macro_precision):.2%}')
    print(f'Class-based macro Recall:    {float(cb_macro_recall):.2%}')
    print(f'Class-based macro F1 score:  {float(cb_macro_f1):.2%}')
    print(f'Class-based micro Precision: {float(cb_micro_precision):.2%}')
    print(f'Class-based micro Recall:    {float(cb_micro_recall):.2%}')
    print(f'Class-based micro F1 score:  {float(cb_micro_f1):.2%}')

    print('writing out classification results')
    with open(Path(args.out_dir) / 'classification_result.json', 'w') as fo:
        for i, item in tqdm(enumerate(dataset)):
            title = item['title']
            predicted_classes = [
                id2ene[j] for j, v in enumerate(prediction_matrix[i]) if v == 1.0]
            reference_classes = [
                id2ene[j] for j, v in enumerate(reference_matrix[i]) if v == 1.0]
            out = {
                'title': title,
                'prediction': predicted_classes,
                'reference': reference_classes,
            }
            print(json.dumps(out, ensure_ascii=False), file=fo)
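# A small self-contained check of the entity-based metric arithmetic above,
# using a toy 3-entity / 3-class prediction matrix. The values are made up
# purely to illustrate how the eb_nopred guard avoids division by zero when an
# entity receives no predicted class.
def _entity_based_metrics_demo():
    import numpy as np

    prediction = np.array([[1, 0, 1],
                           [0, 0, 0],   # no predicted class for this entity
                           [0, 1, 0]], dtype='f')
    reference = np.array([[1, 0, 0],
                          [0, 1, 0],
                          [0, 1, 1]], dtype='f')
    accurate = prediction * reference

    eb_pred = prediction.sum(axis=1)
    eb_ref = reference.sum(axis=1)
    eb_acc = accurate.sum(axis=1)
    eb_nopred = (eb_pred == 0.).astype('f')  # adds 1 to the denominator only where nothing was predicted

    precision = (eb_acc / (eb_pred + eb_nopred)).mean()
    recall = (eb_acc / eb_ref).mean()
    f1 = (2 * eb_acc / (eb_pred + eb_ref)).mean()
    print(precision, recall, f1)  # 0.5, 0.5, ~0.444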
def main():
    # TODO: cleanup and move to conf or remove conf
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "config", type=str,
        help="Config file for training params such as epochs, batch size, lr, etc.")
    parser.add_argument("model_name", type=str,
                        help="The name under which the models will be saved")
    parser.add_argument(
        "dataset_dir", type=str,
        help="Directory where the images and the dataset description are stored")
    parser.add_argument(
        "train_path", type=str,
        help="Path to JSON file containing train set information")
    parser.add_argument(
        "test_path", type=str,
        help="Path to JSON file containing test set information")
    parser.add_argument("-rs", "--resnet-size", type=int, default=18,
                        help="Size of the used ResNet model")
    parser.add_argument("-ld", "--log-dir", type=str,
                        help="Name of the tensorboard logdir")
    parser.add_argument("-ll", "--lossless", action="store_true",
                        help="Use lossless triplet loss instead of the standard one")
    parser.add_argument("-ce", "--ce-classifier", action="store_true",
                        help="Use a cross-entropy classifier instead of triplet loss")
    parser.add_argument(
        "-llr", action="store_true",
        help="Evaluate triplets with log-likelihood ratios instead of kmeans/knn")
    parser.add_argument("-eo", "--eval-only", type=str,
                        help="Only evaluate the given model")
    args = parser.parse_args()

    ###################### INIT ############################
    resnet_size = args.resnet_size
    base_model = PooledResNet(resnet_size)

    # parse config file
    plot_loss = True
    config = configparser.ConfigParser()
    config.read(args.config)

    batch_size = int(config["TRAINING"]["batch_size"])
    epochs = int(config["TRAINING"]["epochs"])
    lr = float(config["TRAINING"]["lr"])
    gpu = config["TRAINING"]["gpu"]

    xp = cuda.cupy if int(gpu) >= 0 else np
    model_name = args.model_name

    # init tensorboard writer
    if args.log_dir is not None:
        if args.eval_only is not None:
            log_dir = f"runs/{args.log_dir}_eval"
        else:
            log_dir = f"runs/{args.log_dir}"
        if os.path.exists(log_dir):
            user_input = input("Log dir not empty. Clear log dir? (y/N)")
            if user_input == "y":
                shutil.rmtree(log_dir)
        writer = SummaryWriter(log_dir)
    else:
        writer = SummaryWriter()
        log_dir = writer.logdir

    with open(os.path.join(writer.logdir, "args.log"), "w") as log_file:
        log_file.write(f"{' '.join(sys.argv[1:])}\n")
    shutil.copy(args.config, writer.logdir)

    print("MODEL_NAME:", model_name, "BATCH_SIZE:", str(batch_size), "EPOCHS:", str(epochs))

    #################### Train and Save Model ####################
    if args.ce_classifier:
        train, test, classes = load_dataset(args)
        # convert labels from string to int
        label_map = {label: i for i, label in enumerate(classes)}
        train = [(sample, label_map[label]) for sample, label in train]
        test = [(sample, label_map[label]) for sample, label in test]

        train_iter = SerialIterator(train, batch_size, repeat=True, shuffle=True)
        test_iter = SerialIterator(test, batch_size, repeat=False, shuffle=False)

        model = CrossEntropyClassifier(base_model, len(classes))

        if int(gpu) >= 0:
            backend.get_device(gpu).use()
            base_model.to_gpu()
            model.to_gpu()

        optimizer = optimizers.Adam(alpha=lr)
        optimizer.setup(model)
        updater = StandardUpdater(train_iter, optimizer, device=gpu)
        evaluator = CEEvaluator(test_iter, model, device=gpu)
    else:
        ### load dataset
        train_triplet, train_samples, train_labels, \
            test_triplet, test_samples, test_labels = load_triplet_dataset(args)

        # decide on the triplet loss function (the lossless variant performed worse in our runs)
        if args.lossless:
            model = LosslessClassifier(base_model)
        else:
            model = StandardClassifier(base_model)

        ### initialise triplet loss model
        train_iter = TripletIterator(train_triplet, batch_size=batch_size, repeat=True, xp=xp)
        test_iter = TripletIterator(test_triplet, batch_size=batch_size, xp=xp)

        if int(gpu) >= 0:
            backend.get_device(gpu).use()
            base_model.to_gpu()
            model.to_gpu()

        optimizer = optimizers.Adam(alpha=lr)
        optimizer.setup(model)
        updater = triplet.Updater(train_iter, optimizer, device=gpu)
        evaluator = triplet.Evaluator(test_iter, model, device=gpu)

    if args.eval_only is None:
        trainer = get_trainer(updater, evaluator, epochs)
        if plot_loss:
            trainer.extend(
                extensions.PlotReport(["main/loss", "validation/main/loss"], "epoch",
                                      file_name=f"{model_name}_loss.png"))
        trainer.extend(
            extensions.snapshot(serializers.save_npz,
                                filename=model_name + "_full_{0.updater.epoch:03d}.npz",
                                target=model))
        best_model_name = model_name + "_full_best.npz"
        trainer.extend(
            extensions.snapshot(serializers.save_npz,
                                filename=best_model_name,
                                target=model),
            trigger=triggers.BestValueTrigger("validation/main/loss",
                                              lambda best, new: new < best))
        if not args.ce_classifier:
            cluster_dir = os.path.join(writer.logdir, "cluster_imgs")
            os.makedirs(cluster_dir, exist_ok=True)
            trainer.extend(
                ClusterPlotter(base_model, test_labels, test_samples, batch_size, xp, cluster_dir),
                trigger=(1, "epoch"))
            # trainer.extend(VisualBackprop(test_triplet[0], test_labels[0], base_model, [["visual_backprop_anchors"]], xp), trigger=(1, "epoch"))
            # trainer.extend(VisualBackprop(test_triplet[2], test_labels[2], base_model, [["visual_backprop_anchors"]], xp), trigger=(1, "epoch"))

        trainer.run()
        # serializers.save_npz(os.path.join(writer.logdir, model_name + "_base.npz"), base_model)

        for file in glob.glob(f"result/{model_name}*"):
            shutil.move(file, writer.logdir)
        best_model_path = os.path.join(writer.logdir, best_model_name)
    else:
        best_model_path = args.eval_only

    #################### Evaluation ####################
    serializers.load_npz(best_model_path, model)
    if args.ce_classifier:
        metrics = evaluate_ce(model, test, batch_size, label_map, xp)
    elif args.llr:
        metrics = evaluate_triplet_with_llr(train_samples, train_labels,
                                            test_samples, test_labels,
                                            log_dir, model, batch_size, xp)
    else:
        metrics = evaluate_triplet(model, train_samples, train_labels,
                                   test_samples, test_labels, batch_size, writer, xp)

    with open(os.path.join(writer.logdir, "metrics.log"), "w") as log_file:
        json.dump(metrics, log_file, indent=4)

    print("Done")
    # sys.exit(0)
    os._exit(0)
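# evaluate_triplet() above is project code; this sketch shows one generic way
# to score learned embeddings with k-means, assuming base_model(batch) returns
# one embedding vector per sample and that samples are already preprocessed
# arrays. The function name, batching, and purity score are illustrative
# assumptions, not the repository's actual evaluation.
def kmeans_purity_sketch(base_model, samples, labels, batch_size, xp):
    import numpy as np
    from sklearn.cluster import KMeans

    embeddings = []
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        for start in range(0, len(samples), batch_size):
            batch = xp.asarray(samples[start:start + batch_size])
            emb = base_model(batch)
            embeddings.append(chainer.cuda.to_cpu(emb.array))
    embeddings = np.concatenate(embeddings, axis=0)

    # map arbitrary label values to integer ids
    label_list = sorted(set(labels))
    label_ids = np.array([label_list.index(label) for label in labels])

    n_clusters = len(label_list)
    assignments = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(embeddings)

    # purity: for each cluster, count the most frequent true label
    correct = sum(np.bincount(label_ids[assignments == c]).max()
                  for c in range(n_clusters) if np.any(assignments == c))
    return correct / len(label_ids)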
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--batchsize', type=int, default=4)
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--mini', action="store_true")
    args = parser.parse_args()

    # launch the forkserver before the communicator and CUDA are initialized,
    # so that MultiprocessIterator workers do not inherit that state
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    comm = chainermn.create_communicator('pure_nccl')
    print(comm.size)
    device = comm.intra_rank

    num_class = len(voc_bbox_label_names)

    data_augmentation_transform = DataAugmentationTransform(512)
    center_detection_transform = CenterDetectionTransform(512, num_class, 4)

    train = TransformDataset(
        ConcatenatedDataset(
            VOCBboxDataset(year='2007', split='trainval'),
            VOCBboxDataset(year='2012', split='trainval')),
        data_augmentation_transform)
    if comm.rank == 0:
        train = TransformDataset(train, center_detection_transform)
        if args.mini:
            train = datasets.SubDataset(train, 0, 100)
    else:
        train = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize // comm.size, n_processes=2)

    if comm.rank == 0:
        test = VOCBboxDataset(year='2007', split='test',
                              use_difficult=True, return_difficult=True)
        if args.mini:
            test = datasets.SubDataset(test, 0, 20)
        test_iter = chainer.iterators.SerialIterator(
            test, args.batchsize, repeat=False, shuffle=False)

    detector = CenterDetector(HourglassNet, 512, num_class)
    train_chain = CenterDetectorTrain(detector, 1, 0.1, 1, comm=comm)

    chainer.cuda.get_device_from_id(device).use()
    train_chain.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(Adam(amsgrad=True), comm)
    optimizer.setup(train_chain)

    updater = StandardUpdater(train_iter, optimizer, device=device)
    trainer = Trainer(updater, (args.epoch, 'epoch'))

    if comm.rank == 0:
        log_interval = 1, 'epoch'
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'lr',
            'main/loss', 'main/hm_loss', 'main/wh_loss', 'main/offset_loss',
            'validation/main/map',
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))
        trainer.extend(
            DetectionVOCEvaluator(test_iter, detector,
                                  use_07_metric=True,
                                  label_names=voc_bbox_label_names),
            trigger=(1, 'epoch'))
        trainer.extend(
            extensions.snapshot_object(detector, 'detector{.updater.epoch:03}.npz'),
            trigger=(1, 'epoch'))

    trainer.run()
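# The script above runs DetectionVOCEvaluator only on rank 0, so validation is
# single-process. ChainerMN's create_multi_node_evaluator can instead spread
# evaluation across ranks; the sketch below shows that alternative, assuming
# the test set is scattered like the training set. It is not what the code
# above does, and note that averaging per-shard mAP only approximates the
# full-set mAP. The returned evaluator would have to be extended on all ranks.
def make_distributed_evaluator_sketch(comm, detector, batchsize):
    test = None
    if comm.rank == 0:
        test = VOCBboxDataset(year='2007', split='test',
                              use_difficult=True, return_difficult=True)
    test = chainermn.scatter_dataset(test, comm)
    test_iter = chainer.iterators.SerialIterator(test, batchsize,
                                                 repeat=False, shuffle=False)
    evaluator = DetectionVOCEvaluator(test_iter, detector, use_07_metric=True,
                                      label_names=voc_bbox_label_names)
    # every rank evaluates its shard; results are reduced to the root process
    return chainermn.create_multi_node_evaluator(evaluator, comm)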
def train(model_class, n_base_units, trained_model, no_obj_weight, data, result_dir,
          initial_batch_size=10, max_batch_size=1000, max_epoch=100):
    train_x, train_y, val_x, val_y = data

    max_class_id = 0
    for objs in val_y:
        for obj in objs:
            max_class_id = max(max_class_id, obj[4])
    n_classes = max_class_id + 1

    class_weights = [1.0 for i in range(n_classes)]
    class_weights[0] = no_obj_weight

    train_dataset = YoloDataset(train_x, train_y,
                                target_size=model_class.img_size,
                                n_grid=model_class.n_grid,
                                augment=True,
                                class_weights=class_weights)
    test_dataset = YoloDataset(val_x, val_y,
                               target_size=model_class.img_size,
                               n_grid=model_class.n_grid,
                               augment=False,
                               class_weights=class_weights)

    model = model_class(n_classes, n_base_units)
    model.loss_calc = LossCalculator(n_classes, class_weights=class_weights)

    last_result_file = os.path.join(result_dir, 'best_loss.npz')
    if os.path.exists(last_result_file):
        try:
            chainer.serializers.load_npz(last_result_file, model)
            print('this training has already been done; reusing the result')
            return model
        except Exception:
            pass

    if trained_model:
        print('copy params from trained model')
        copy_params(trained_model, model)

    optimizer = Adam()
    optimizer.setup(model)

    n_physical_cpu = int(math.ceil(multiprocessing.cpu_count() / 2))

    train_iter = MultiprocessIterator(train_dataset, batch_size=initial_batch_size,
                                      n_prefetch=n_physical_cpu,
                                      n_processes=n_physical_cpu)
    test_iter = MultiprocessIterator(test_dataset, batch_size=initial_batch_size,
                                     shuffle=False, repeat=False,
                                     n_prefetch=n_physical_cpu,
                                     n_processes=n_physical_cpu)
    updater = StandardUpdater(train_iter, optimizer, device=0)
    stopper = triggers.EarlyStoppingTrigger(check_trigger=(1, 'epoch'),
                                            monitor="validation/main/loss",
                                            patients=10, mode="min",
                                            max_trigger=(max_epoch, "epoch"))
    trainer = Trainer(updater, stopper, out=result_dir)

    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.Evaluator(test_iter, model, device=0))
    trainer.extend(
        extensions.PrintReport([
            'epoch',
            'main/loss', 'validation/main/loss',
            'main/cl_loss', 'validation/main/cl_loss',
            'main/cl_acc', 'validation/main/cl_acc',
            'main/pos_loss', 'validation/main/pos_loss',
        ]))
    trainer.extend(extensions.snapshot_object(model, 'best_loss.npz'),
                   trigger=triggers.MinValueTrigger('validation/main/loss'))
    trainer.extend(extensions.snapshot_object(model, 'best_classification.npz'),
                   trigger=triggers.MaxValueTrigger('validation/main/cl_acc'))
    trainer.extend(extensions.snapshot_object(model, 'best_position.npz'),
                   trigger=triggers.MinValueTrigger('validation/main/pos_loss'))
    trainer.extend(extensions.snapshot_object(model, 'model_last.npz'),
                   trigger=(1, 'epoch'))
    trainer.extend(AdaptiveBatchsizeIncrement(maxsize=max_batch_size), trigger=(1, 'epoch'))

    trainer.run()

    chainer.serializers.load_npz(os.path.join(result_dir, 'best_loss.npz'), model)

    return model
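# AdaptiveBatchsizeIncrement above is project code; a minimal sketch of what
# such a trainer extension could look like is given below. It grows the batch
# size after every epoch up to maxsize by mutating the iterator's batch_size
# attribute, which SerialIterator reads on every __next__; whether
# MultiprocessIterator tolerates this at runtime is an assumption, so treat
# this as a sketch rather than the actual implementation.
class AdaptiveBatchsizeIncrementSketch(chainer.training.Extension):

    trigger = 1, 'epoch'

    def __init__(self, maxsize, factor=2):
        self.maxsize = maxsize
        self.factor = factor

    def __call__(self, trainer):
        iterator = trainer.updater.get_iterator('main')
        iterator.batch_size = min(iterator.batch_size * self.factor, self.maxsize)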