def train(args):
    """Train a Style2PO SynTex model from a dict of options.

    Args:
        args (dict): configuration — expects keys such as data_folder,
            save_folder, image_size, max_epoch, save_epoch, n_gpu,
            batch_size, lr, steps_per_epoch, image_steps, scalar_steps,
            act_input. Missing/falsy values fall back to defaults below.
    """
    data_folder = args.get("data_folder")
    save_folder = args.get("save_folder")
    image_size = args.get("image_size")
    max_epoch = args.get("max_epoch")
    save_epoch = args.get("save_epoch") or max_epoch // 10
    # Scale lr and steps_per_epoch accordingly.
    # Make sure the total number of gradient evaluations is consistent.
    n_gpu = args.get("n_gpu") or 1
    batch_size = args.get("batch_size") or BATCH
    equi_batch_size = max(n_gpu, 1) * batch_size
    lr = args.get("lr") or LR
    lr *= equi_batch_size
    steps_per_epoch = args.get("steps_per_epoch") or 1000
    # FIX: true division produced a float (the trainer expects an int) and
    # could also round down to 0 for large equi_batch_size — clamp to >= 1.
    steps_per_epoch = max(int(steps_per_epoch) // equi_batch_size, 1)
    image_steps = args.get("image_steps") or steps_per_epoch // 10
    # FIX: args.get() returns None when the key is absent, which would crash
    # the `> 0` comparison below; default to 0 (merge every epoch).
    scalar_steps = args.get("scalar_steps") or 0
    if scalar_steps > 0:
        scalar_steps = max(scalar_steps // equi_batch_size, 1)
    else:
        scalar_steps = 0  # merge scalar summary every epoch
    # lr starts decreasing at half of max epoch
    start_dec_epoch = max_epoch // 2
    # stops when lr is 0.01 of its initial value
    end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
    # adjust noise input range according to the input act
    zmin, zmax = (0, 1) if args.get("act_input") == "identity" else (-1, 1)
    # FIX: compare with None by identity, not equality.
    if save_folder is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_folder)

    df = get_data(data_folder, image_size, zmin=zmin, zmax=zmax, batch=batch_size)
    df = PrintData(df)
    data = QueueInput(df)

    SynTexTrainer(data, Style2PO(args), n_gpu).train_with_defaults(
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
            PeriodicTrigger(ModelSaver(), every_k_epochs=end_epoch),  # save model at last
            ScheduledHyperParamSetter('learning_rate',
                                      [(start_dec_epoch, lr), (max_epoch, 0)],
                                      interp="linear"),
            PeriodicTrigger(VisualizeTestSet(data_folder, image_size),
                            every_k_epochs=max(1, max_epoch // 100)),
            #MergeAllSummaries(period=scalar_steps), # scalar only, slowdown in training, use TCMalloc
            MergeAllSummaries(period=image_steps, key="image_summaries"),
            MergeAllSummaries(key="acti_summaries"),
        ],
        max_epoch=end_epoch,
        steps_per_epoch=steps_per_epoch,
        session_init=None)
def fit(self, data):
    """Fit the model to the given data.

    Args:
        data(pandas.DataFrame): dataset to fit the model.

    Returns:
        None
    """
    # Encode the raw table into the model's internal representation.
    self.preprocessor = Preprocessor(
        continuous_columns=self.continuous_columns)
    data = self.preprocessor.fit_transform(data)
    self.metadata = self.preprocessor.metadata
    dataflow = TGANDataFlow(data, self.metadata)
    batch_data = BatchData(dataflow, self.batch_size)
    input_queue = QueueInput(batch_data)

    self.model = self.get_model(training=True)

    # Train the generator once every 6 steps; the discriminator trains on
    # the remaining steps.
    trainer = SeparateGANTrainer(
        model=self.model,
        input_queue=input_queue,
        g_period=6,
    )

    # Resume from an existing checkpoint when requested and available;
    # the next epoch number is recovered from the training stats log.
    self.restore_path = os.path.join(self.model_dir, 'checkpoint')
    if os.path.isfile(self.restore_path) and self.restore_session:
        session_init = SaverRestore(self.restore_path)
        with open(os.path.join(self.log_dir, 'stats.json')) as f:
            starting_epoch = json.load(f)[-1]['epoch_num'] + 1
    else:
        session_init = None
        starting_epoch = 1

    # NOTE(review): setting the logger dir was deliberately disabled here;
    # the dead `action` local that fed it has been removed.
    # logger.set_logger_dir(self.log_dir, action='k' if self.restore_session else 'd')

    callbacks = []
    monitors = []
    if self.save_checkpoints:
        callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))
    callbacks.append(MergeAllSummaries(period=10))
    if self.experiment is not None:
        # FIX: import lazily and only when actually needed, so comet_ml
        # remains an optional dependency.
        from tensorpack.callbacks import CometMLMonitor
        monitors.append(CometMLMonitor(experiment=self.experiment))

    trainer.train_with_defaults(
        callbacks=callbacks,
        monitors=monitors,
        steps_per_epoch=self.steps_per_epoch,
        max_epoch=self.max_epoch,
        session_init=session_init,
        starting_epoch=starting_epoch,
    )

    self.prepare_sampling()
def fit(self, data):
    """Fit the model to the given data.

    Args:
        data(pandas.DataFrame): dataset to fit the model.

    Returns:
        None

    Raises:
        ValueError: if ``self.trainer`` names an unknown trainer class.
    """
    # Encode the raw table into the model's internal representation.
    self.preprocessor = Preprocessor(
        continuous_columns=self.continuous_columns)
    data = self.preprocessor.fit_transform(data)
    self.metadata = self.preprocessor.metadata
    dataflow = TGANDataFlow(data, self.metadata)
    batch_data = BatchData(dataflow, self.batch_size)
    input_queue = QueueInput(batch_data)

    self.model = self.get_model(training=True)

    # Select the trainer implementation by its configured name.
    if self.trainer == 'GANTrainer':
        trainer = GANTrainer(model=self.model, input_queue=input_queue)
    elif self.trainer == 'SeparateGANTrainer':
        trainer = SeparateGANTrainer(model=self.model, input_queue=input_queue)
    else:
        raise ValueError(
            'Incorrect trainer name. Use GANTrainer or SeparateGANTrainer')

    # Resume from an existing checkpoint when requested and available;
    # the next epoch number is recovered from the training stats log.
    self.restore_path = os.path.join(self.model_dir, 'checkpoint')
    if os.path.isfile(self.restore_path) and self.restore_session:
        session_init = SaverRestore(self.restore_path)
        with open(os.path.join(self.log_dir, 'stats.json')) as f:
            starting_epoch = json.load(f)[-1]['epoch_num'] + 1
    else:
        session_init = None
        starting_epoch = 1

    # 'k' keeps the existing log directory when resuming; None lets the
    # logger use its default behavior for a fresh run.
    action = 'k' if self.restore_session else None
    logger.set_logger_dir(self.log_dir, action=action)

    callbacks = []
    if self.save_checkpoints:
        callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

    trainer.train_with_defaults(
        callbacks=callbacks,
        steps_per_epoch=self.steps_per_epoch,
        max_epoch=self.max_epoch,
        session_init=session_init,
        starting_epoch=starting_epoch,
    )

    self.prepare_sampling()
def get_config(
    files_list,
    input_names=("state_1", "state_2"),
    output_names=("Qvalue_1", "Qvalue_2"),
    agents=2,
):
    """Build the TrainConfig used during training (multi-agent DQN).

    Args:
        files_list: dataset file list forwarded to the player and evaluator.
        input_names: state input tensor names, one per agent.
        output_names: Q-value output tensor names, one per agent.
        agents (int): number of agents.

    Returns:
        TrainConfig: fully wired tensorpack training configuration.
    """
    # FIX: the defaults were mutable lists (shared across calls). Use tuple
    # defaults and convert to lists here so downstream APIs see the same
    # list values as before.
    input_names = list(input_names)
    output_names = list(output_names)

    expreplay = ExpReplay(
        predictor_io_names=(input_names, output_names),
        player=get_player(task="train", files_list=files_list, agents=agents),
        state_shape=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        memory_size=MEMORY_SIZE,
        init_memory_size=INIT_MEMORY_SIZE,
        init_exploration=1.0,
        update_frequency=UPDATE_FREQ,
        history_len=FRAME_HISTORY,
        agents=agents,
    )
    return TrainConfig(
        # dataflow=expreplay,
        data=QueueInput(expreplay),
        model=Model(agents=agents),
        callbacks=[
            ModelSaver(),
            PeriodicTrigger(
                RunOp(DQNModel.update_target_param, verbose=True),
                # update target network every 10k steps
                every_k_steps=10000 // UPDATE_FREQ,
            ),
            expreplay,
            ScheduledHyperParamSetter("learning_rate", [(60, 4e-4), (100, 2e-4)]),
            ScheduledHyperParamSetter(
                ObjAttrParam(expreplay, "exploration"),
                # 1->0.1 in the first million steps
                [(0, 1), (10, 0.1), (320, 0.01)],
                interp="linear",
            ),
            PeriodicTrigger(
                Evaluator(
                    nr_eval=EVAL_EPISODE,
                    input_names=input_names,
                    output_names=output_names,
                    files_list=files_list,
                    get_player_fn=get_player,
                    agents=agents,
                ),
                every_k_epochs=EPOCHS_PER_EVAL,
            ),
            HumanHyperParamSetter("learning_rate"),
        ],
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
def get_config():
    """Assemble the TrainConfig used during training."""
    replay = ExpReplay(
        predictor_io_names=(['state'], ['Qvalue']),
        player=get_player(directory=data_dir, task='train',
                          files_list=train_data_fpaths),
        state_shape=OBSERVATION_DIMS,
        batch_size=BATCH_SIZE,
        memory_size=MEMORY_SIZE,
        init_memory_size=INIT_MEMORY_SIZE,
        init_exploration=1.0,
        update_frequency=UPDATE_FREQ,
        frame_history_len=FRAME_HISTORY)

    # TODO: periodically save videos
    # TODO: original code used a plain ModelSaver() here.
    saver = ModelSaver(checkpoint_dir="model_checkpoints",
                       keep_checkpoint_every_n_hours=0.25,
                       max_to_keep=1000)
    # Refresh the frozen target network every 10k/UPDATE_FREQ trainer steps.
    target_sync = PeriodicTrigger(
        RunOp(DQNModel.update_target_param, verbose=True),
        every_k_steps=10000 // UPDATE_FREQ)
    lr_setter = ScheduledHyperParamSetter('learning_rate',
                                          [(60, 4e-4), (100, 2e-4)])
    exploration_setter = ScheduledHyperParamSetter(
        ObjAttrParam(replay, 'exploration'),
        [(0, 1), (100, 0.1), (120, 0.01)],
        interp='linear')
    # Runs replay._trigger() periodically instead of registering the replay
    # buffer itself as a callback.
    replay_trigger = PeriodicTrigger(replay, every_k_steps=5000)
    # eval_model_multithread(pred, EVAL_EPISODE, get_player)
    periodic_eval = PeriodicTrigger(
        Evaluator(nr_eval=EVAL_EPISODE,
                  input_names=['state'],
                  output_names=['Qvalue'],
                  directory=data_dir,
                  files_list=test_data_fpaths,
                  get_player_fn=get_player),
        every_k_steps=10000 // UPDATE_FREQ)

    return TrainConfig(
        data=QueueInput(replay),
        model=Model(),
        callbacks=[
            saver,
            target_sync,
            lr_setter,
            exploration_setter,
            replay_trigger,
            periodic_eval,
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=NUM_EPOCHS,
    )
def train_net(net, session_init, batch_size, num_epochs, train_dataflow, val_dataflow):
    """Launch multi-tower training of `net`, validating after every epoch."""
    num_towers = max(get_num_gpu(), 1)
    batch_per_tower = batch_size // num_towers
    logger.info("Running on {} towers. Batch size per tower: {}".format(num_towers, batch_per_tower))

    num_training_samples = 1281167
    step_size = num_training_samples // batch_size
    max_iter = (num_epochs - 1) * step_size

    # Linearly decay the learning rate from 0.5 to 0 across all iterations.
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 0.5), (max_iter, 0)],
                                  interp='linear',
                                  step_based=True),
        EstimatedTimeLeft(),
    ]

    val_metrics = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5'),
    ]
    if num_towers > 1:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            input=val_dataflow,
            infs=val_metrics,
            gpus=list(range(num_towers))))
    else:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(
            input=QueueInput(val_dataflow),
            infs=val_metrics))

    launch_train_with_config(
        config=TrainConfig(
            dataflow=train_dataflow,
            model=net,
            callbacks=callbacks,
            session_init=session_init,
            steps_per_epoch=step_size,
            max_epoch=num_epochs),
        trainer=SyncMultiGPUTrainerParameterServer(num_towers))
def get_config():
    """Build the TrainConfig used during training."""
    replay = ExpReplay(
        predictor_io_names=(['state'], ['Qvalue']),
        player=get_player(directory=data_dir, task='train',
                          files_list=train_list),
        state_shape=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        memory_size=MEMORY_SIZE,
        init_memory_size=INIT_MEMORY_SIZE,
        init_exploration=1.0,
        update_frequency=UPDATE_FREQ,
        history_len=FRAME_HISTORY)

    training_callbacks = [
        ModelSaver(),
        # Refresh the frozen target network every ~10k steps.
        PeriodicTrigger(RunOp(DQNModel.update_target_param, verbose=True),
                        every_k_steps=10000 // UPDATE_FREQ),
        replay,
        ScheduledHyperParamSetter('learning_rate', [(60, 4e-4), (100, 2e-4)]),
        # Anneal exploration 1 -> 0.1 -> 0.01 over training epochs.
        ScheduledHyperParamSetter(ObjAttrParam(replay, 'exploration'),
                                  [(0, 1), (10, 0.1), (320, 0.01)],
                                  interp='linear'),
        PeriodicTrigger(Evaluator(nr_eval=EVAL_EPISODE,
                                  input_names=['state'],
                                  output_names=['Qvalue'],
                                  directory=data_dir,
                                  files_list=test_list,
                                  get_player_fn=get_player),
                        every_k_epochs=EPOCHS_PER_EVAL),
        HumanHyperParamSetter('learning_rate'),
    ]

    return TrainConfig(
        data=QueueInput(replay),
        model=Model(),
        callbacks=training_callbacks,
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
    )
def get_config(model, conf):
    """Create the TrainConfig for `model` under options `conf`."""
    nr_tower = max(get_nr_gpu(), 1)
    batch = conf.batch

    if conf.fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]], 1000,
                                 random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data(conf.data_dir, 'train', batch)
        dataset_val = get_data(conf.data_dir, 'val', batch)

        lr_schedule = [(45, 1e-2), (60, 1e-3), (65, 1e-4), (70, 1e-5), (75, 1e-6)]
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', lr_schedule),
            HumanHyperParamSetter('learning_rate'),
        ]
        val_metrics = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5'),
        ]
        if nr_tower > 1:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, val_metrics, list(range(nr_tower))))
        else:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), val_metrics))

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=5000,
                       max_epoch=80,
                       nr_tower=nr_tower)
def get_config(files_list, data_type, trainable_variables):
    """This is only used during training."""
    replay = ExpReplay(
        predictor_io_names=(['state'], ['Qvalue']),
        player=get_player(task='train', files_list=files_list,
                          data_type=data_type),
        state_shape=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        memory_size=MEMORY_SIZE,
        init_memory_size=INIT_MEMORY_SIZE,
        init_exploration=0.8,  # 0.0
        # HITL UPDATE: training starts at the pretraining update frequency.
        update_frequency=INIT_UPDATE_FREQ,
        history_len=FRAME_HISTORY,
        arg_type=data_type)

    # update target network every 10k steps
    target_sync = PeriodicTrigger(
        RunOp(DQNModel.update_target_param, verbose=True),
        every_k_steps=10000 // UPDATE_FREQ)
    lr_setter = ScheduledHyperParamSetter('learning_rate',
                                          [(60, 4e-4), (100, 2e-4)])
    # 1->0.1 in the first million steps
    exploration_setter = ScheduledHyperParamSetter(
        ObjAttrParam(replay, 'exploration'),
        [(0, 0.8), (1000000, 0.1), (32000000, 0.01)],
        interp='linear', step_based=True)
    # HITL UPDATE: the number of environment steps taken between TD updates
    # is raised from 0 during pretraining to UPDATE_FREQ afterwards, so the
    # agent takes steps in the env between each TD update.
    update_freq_setter = ScheduledHyperParamSetter(
        ObjAttrParam(replay, 'update_frequency'),
        [(0, INIT_UPDATE_FREQ), (NUM_PRETRAIN, UPDATE_FREQ)],
        interp=None, step_based=True)
    periodic_eval = PeriodicTrigger(
        Evaluator(nr_eval=EVAL_EPISODE,
                  input_names=['state'],
                  output_names=['Qvalue'],
                  files_list=files_list,
                  data_type=data_type,
                  get_player_fn=get_player),
        every_k_steps=STEPS_PER_EVAL)

    return TrainConfig(
        data=QueueInput(replay),
        model=Model(IMAGE_SIZE, FRAME_HISTORY, METHOD, NUM_ACTIONS, GAMMA,
                    trainable_variables),
        callbacks=[
            ModelSaver(),
            target_sync,
            replay,
            lr_setter,
            exploration_setter,
            update_freq_setter,
            periodic_eval,
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=MAX_EPOCHS,
    )
# Tail of an AdaptiveSynTex training routine; relies on max_epoch, args,
# save_folder, data_folder, image_size, save_epoch, lr, scalar_steps,
# image_steps, steps_per_epoch and n_gpu defined earlier (out of view).
# lr starts decreasing at half of max epoch
start_dec_epoch = max_epoch // 2
# stops when lr is 0.01 of its initial value
end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
# adjust noise input range according to the input act
zmin, zmax = (0, 1) if args.get("act") == "identity" else (-1, 1)
# FIX: compare with None by identity, not equality.
if save_folder is None:
    logger.auto_set_dir()
else:
    logger.set_logger_dir(save_folder)

df = get_data(data_folder, image_size, zmin=zmin, zmax=zmax)
df = PrintData(df)
data = QueueInput(df)

SynTexTrainer(data, AdaptiveSynTex(args), n_gpu).train_with_defaults(
    callbacks=[
        PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
        PeriodicTrigger(ModelSaver(), every_k_epochs=end_epoch),  # save model at last
        ScheduledHyperParamSetter('learning_rate',
                                  [(start_dec_epoch, lr), (max_epoch, 0)],
                                  interp="linear"),
        #PeriodicTrigger(VisualizeTestSet(data_folder, image_size), every_k_epochs=10),
        MergeAllSummaries(period=scalar_steps),  # scalar only
        MergeAllSummaries(period=image_steps, key="image_summaries"),
    ],
    max_epoch=end_epoch,
    steps_per_epoch=steps_per_epoch,
    session_init=None)
# Top-level training setup for a projection-reconstruction model.
# Assumes Ppy is a stacked projection array defined earlier — presumably
# (nviews, ..., vlen); verify against the caller.
vlen, nviews = Ppy.shape[-1], Ppy.shape[0]
# NOTE(review): GPU index is hard-coded to 3 — confirm this is intentional.
os.environ['CUDA_VISIBLE_DEVICES'] = get_visible_device_list(3)
global_step = get_global_step_var()
# set logger directory for checkpoints, etc
# action='k' keeps any existing log directory instead of prompting.
logger.set_logger_dir(args.logdir, action='k')
steps_per_epoch = cfg.EPOCH_STEPS
model = Model(vlen, nviews)
# config.gpu_options.allow_growth = True
traincfg = TrainConfig(
    model=model,
    data=QueueInput(ProjDataFlow(Ppy)),
    callbacks=[
        # Checkpoint weights and reconstructed volumes every 5 epochs.
        PeriodicTrigger(ModelSaver(), every_k_epochs=5),
        PeriodicTrigger(VolumeSaver(model), every_k_epochs=5),
        # prevent learning in the first epoch
        # MemInitHyperParamSetter('learning_rate_mask',(0,1)),
        # controls learning rate as a function of epoch
        HyperParamSetterWithFunc('learning_rate', learning_rate_fun),
        # GraphProfiler()
        # PeakMemoryTracker()
        # GPUUtilizationTracker(),
    ],
    steps_per_epoch=steps_per_epoch,
    max_epoch=200000,
    # first time load model from checkpoint and reset GRU state
    session_init=ChainInit([TryResumeTraining()]),  #,ResetInit(model)])
    # session_config=tf.ConfigProto(log_device_placement=True) #config_gpus(1)
)
def train(checkpoint_dir, model_name, dataset, num_epochs, quant_type,
          batch_size_per_gpu, lr=None, post_quantize_only=False):
    """Train (and quantise) `model_name` on `dataset`.

    Args:
        checkpoint_dir: directory for checkpoints; also the restore source
            when `post_quantize_only` is set.
        model_name: model key understood by `get_model_func`.
        dataset: key into `datasets.DATASETS`.
        num_epochs: total epochs; if None, taken from the last LR-schedule
            entry.
        quant_type: quantisation scheme forwarded to `get_model_func`.
        batch_size_per_gpu: per-tower batch size.
        lr: optional learning rate — a float, a schedule list of
            (epoch, value) pairs, or a string literal of either
            (parsed with ast.literal_eval).
        post_quantize_only: if True, quantise from epoch 0 and restore from
            an existing checkpoint.
    """
    train_data, test_data, (img_shape, label_shape) = datasets.DATASETS[dataset]()

    num_gpus = max(gpu.get_num_gpu(), 1)
    train_data = BatchData(train_data, batch_size_per_gpu)
    test_data = BatchData(test_data, batch_size_per_gpu, remainder=True)
    steps_per_epoch = len(train_data) // num_gpus

    # Normalise `lr` into a schedule of (epoch, value) pairs.
    if lr:
        if isinstance(lr, str):
            lr = ast.literal_eval(lr)
        if isinstance(lr, float):
            lr_schedule = [(0, lr)]
        else:
            lr_schedule = lr
    else:
        lr_schedule = [(0, 0.005), (8, 0.1), (25, 0.005), (30, 0)]

    if num_epochs is None:
        num_epochs = lr_schedule[-1][0]
    if post_quantize_only:
        start_quantising_at_epoch = 0
    else:
        # Quantise from the second-to-last schedule point, or the final
        # five epochs when the schedule has a single entry.
        start_quantising_at_epoch = lr_schedule[-2][0] if len(
            lr_schedule) > 1 else max(0, num_epochs - 5)

    logger.info(f"Training with LR schedule: {str(lr_schedule)}")
    logger.info(f"Quantising at epoch {start_quantising_at_epoch}")

    # train_data = FakeData([(batch_size_per_gpu,) + img_shape, (batch_size_per_gpu, ) + label_shape])

    model_func, input_spec, output_spec = get_model_func(
        "train", model_name, quant_type, img_shape,
        num_classes=label_shape[0],
        quant_delay=steps_per_epoch * start_quantising_at_epoch)
    target_spec = [
        tf.TensorSpec(t.shape, t.dtype, name=t.name.split("/")[-1] + "_target")
        for t in output_spec
    ]
    model = KerasModel(get_model=model_func,
                       input_signature=input_spec,
                       target_signature=target_spec,
                       input=train_data,
                       trainer=SyncMultiGPUTrainerParameterServer(
                           num_gpus, ps_device='gpu'))

    # FIX: use a distinct name for the graph variable instead of shadowing
    # the `lr` parameter (which may still hold the user-supplied schedule).
    lr_var = tf.get_variable('learning_rate',
                             initializer=lr_schedule[0][1],
                             trainable=False)
    tf.summary.scalar('learning_rate-summary', lr_var)
    model.compile(optimizer=tf.train.MomentumOptimizer(learning_rate=lr_var,
                                                       momentum=0.9),
                  loss="categorical_crossentropy",
                  metrics=["categorical_accuracy"])

    model.fit(steps_per_epoch=steps_per_epoch,
              max_epoch=num_epochs,
              callbacks=[
                  ModelSaver(max_to_keep=1, checkpoint_dir=checkpoint_dir),
                  DataParallelInferenceRunner(
                      test_data,
                      ScalarStats(model._stats_to_inference),
                      num_gpus),
                  ScheduledHyperParamSetter('learning_rate', lr_schedule,
                                            interp="linear"),
                  # Halve the LR when validation accuracy plateaus.
                  StatMonitorParamSetter('learning_rate',
                                         'validation_categorical_accuracy',
                                         lambda x: x / 2,
                                         threshold=0.001,
                                         last_k=10,
                                         reverse=True)
              ],
              session_init=SaverRestore(checkpoint_dir + "/checkpoint")
              if post_quantize_only else None)
# Top-level training setup for a CIFAR ResNet; relies on save_dir,
# NUM_UNITS, mult_decay, lr_base and args defined earlier (out of view).
# Use an auto-generated log directory unless one was given explicitly.
if save_dir is None:
    logger.auto_set_dir()
else:
    logger.set_logger_dir(save_dir)

dataset_train = get_data('train')
dataset_test = get_data('test')

config = TrainConfig(
    # Initial LR is a warmup value: one tenth of the base rate.
    model=CifarResNet(n=NUM_UNITS, mult_decay=mult_decay, lr_init=lr_base * 0.1),
    dataflow=dataset_train,
    callbacks=[
        ModelSaver(),
        # Validate after each epoch: track loss and classification error.
        InferenceRunner(
            dataset_test,
            [ScalarStats('cost'), ClassificationError('wrong_vector')]),
        # Classic CIFAR ResNet step schedule, scaled by lr_base.
        ScheduledHyperParamSetter('learning_rate',
                                  [(1, lr_base), (82, lr_base * 0.1),
                                   (123, lr_base * 0.01), (164, lr_base * 0.002)])
    ],
    max_epoch=200,
    # SmartInit handles both "no checkpoint" and "restore from args.load".
    session_init=SmartInit(args.load),
)
num_gpu = max(get_num_gpu(), 1)
launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(num_gpu))
# Tail of a ProgressiveSynTex training routine; relies on args, n_gpu,
# max_epoch, save_folder, data_folder, image_size, save_epoch, lr and
# image_steps defined earlier (out of view).
# FIX: integer-divide and clamp so steps_per_epoch is a positive int
# (true division yielded a float, which the trainer does not accept).
steps_per_epoch = max(int(args.get("steps_per_epoch", 1000)) // max(n_gpu, 1), 1)
# lr starts decreasing at half of max epoch
start_dec_epoch = max_epoch // 2
# stops when lr is 0.01 of its initial value
end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
# FIX: compare with None by identity, not equality.
if save_folder is None:
    logger.auto_set_dir()
else:
    logger.set_logger_dir(save_folder)

df = get_data(data_folder, image_size)
df = PrintData(df)
data = QueueInput(df)

SynTexTrainer(data, ProgressiveSynTex(args), n_gpu).train_with_defaults(
    callbacks=[
        PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
        PeriodicTrigger(ModelSaver(), every_k_epochs=end_epoch),  # save model at last
        ScheduledHyperParamSetter(
            'learning_rate',
            [(start_dec_epoch, lr), (max_epoch, 0)], interp="linear"),
        #PeriodicTrigger(VisualizeTestSet(data_folder, image_size), every_k_epochs=10),
        MergeAllSummaries(period=10),  # scalar only
        MergeAllSummaries(period=image_steps, key="image_summaries"),
    ],
    max_epoch=end_epoch,
    steps_per_epoch=steps_per_epoch,
    session_init=None
)