def load_params(self, args):
    """Load arguments."""
    # Tile size
    self.tile_size = int(args['--tile_size'])

    # Paths
    self.model_path = args['--model']
    # get absolute path for input directory - otherwise it may give an error in JP2Image.m
    self.input_dir = os.path.abspath(args['--input_dir'])
    self.output_dir = args['--output_dir']
    rm_n_mkdir(self.output_dir)

    self.logging_dir = args['--logging_dir']
    logging_dir = self.output_dir + '/' + self.logging_dir
    rm_n_mkdir(logging_dir)
    logger.set_logger_dir(logging_dir)

    self.logging_level = args['--logging_level']
    # TODO: this depends on tensorflow getting first crack at the logger
    # (and adding the default stdout handler with INFO-level logging)
    logger._logger.handlers[0].setLevel(self.logging_level)
    logger._logger.setLevel(self.logging_level)

    # Processing
    self.batch_size = int(args['--batch_size'])

    # Below is specific to WSI processing
    self.return_masks = args['--return_masks']
    self.tiss_lvl = 3  # default WSI level at which to perform tissue segmentation
    print(f"'--tissue_level' provided: {args['--tissue_level']}")
    try:
        if args['--tissue_level'] and int(args['--tissue_level']) > 3:
            self.tiss_lvl = int(args['--tissue_level'])
    except (KeyError, TypeError, ValueError):
        pass
def train(args):
    data_folder = args.get("data_folder")
    save_folder = args.get("save_folder")
    image_size = args.get("image_size")
    max_epoch = args.get("max_epoch")
    save_epoch = args.get("save_epoch") or max_epoch // 10
    # Scale lr and steps_per_epoch accordingly.
    # Make sure the total number of gradient evaluations is consistent.
    n_gpu = args.get("n_gpu") or 1
    batch_size = args.get("batch_size") or BATCH
    equi_batch_size = max(n_gpu, 1) * batch_size
    lr = args.get("lr") or LR
    lr *= equi_batch_size
    steps_per_epoch = args.get("steps_per_epoch") or 1000
    steps_per_epoch /= equi_batch_size
    image_steps = args.get("image_steps") or steps_per_epoch // 10
    scalar_steps = args.get("scalar_steps")
    if scalar_steps > 0:
        scalar_steps = max(scalar_steps // equi_batch_size, 1)
    else:
        scalar_steps = 0  # merge scalar summary every epoch
    # lr starts decreasing at half of max epoch
    start_dec_epoch = max_epoch // 2
    # stops when lr is 0.01 of its initial value
    end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
    # adjust noise input range according to the input act
    zmin, zmax = (0, 1) if args.get("act_input") == "identity" else (-1, 1)

    if save_folder is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_folder)

    df = get_data(data_folder, image_size, zmin=zmin, zmax=zmax, batch=batch_size)
    df = PrintData(df)
    data = QueueInput(df)

    SynTexTrainer(data, Style2PO(args), n_gpu).train_with_defaults(
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
            PeriodicTrigger(ModelSaver(), every_k_epochs=end_epoch),  # save model at last
            ScheduledHyperParamSetter('learning_rate',
                                      [(start_dec_epoch, lr), (max_epoch, 0)],
                                      interp="linear"),
            PeriodicTrigger(VisualizeTestSet(data_folder, image_size),
                            every_k_epochs=max(1, max_epoch // 100)),
            # MergeAllSummaries(period=scalar_steps),  # scalar only; slowdown in training, use TCMalloc
            MergeAllSummaries(period=image_steps, key="image_summaries"),
            MergeAllSummaries(key="acti_summaries"),
        ],
        max_epoch=end_epoch,
        steps_per_epoch=steps_per_epoch,
        session_init=None)
def get_config():
    log_dir = 'train_log/cifar10-bc-k[%d]-path[%d]-[%d-%d-%d-%d]-' % (
        int(args.k), int(args.path), int(args.block1),
        int(args.block2), int(args.block3), int(args.block4))
    logger.set_logger_dir(log_dir, action='n')

    # prepare dataset
    dataset_train = get_data('train')
    dataset_test = get_data('test')

    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
    # config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
    # config.gpu_options.allow_growth = True
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4

    callbacks = []
    callbacks.append(ModelSaver())

    nr_tower = len(args.gpu.split(','))
    print('nr_tower = {}'.format(nr_tower))
    steps_per_epoch = dataset_train.size() // nr_tower

    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(
            InferenceRunner(dataset_test,
                            [ScalarStats('cost'), ClassificationError()]))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(
                dataset_test,
                [ScalarStats('cost'), ClassificationError()],
                list(range(nr_tower))))
    # callbacks.append(InferenceRunner(dataset_test,
    #     [ScalarStats('cost', prefix="testing"),
    #      ClassificationError(summary_name='validation_error1')]))
    # callbacks.append(DataParallelInferenceRunner(
    #     dataset_test, [ScalarStats('cost'), ClassificationError()],
    #     list(range(nr_tower))))

    callbacks.append(
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 0.1), (args.drop_1, 0.01),
                                   (args.drop_2, 0.001), (args.drop_3, 0.0002)]))

    return TrainConfig(
        dataflow=dataset_train,
        # callbacks=[
        #     ModelSaver(),
        #     InferenceRunner(dataset_test,
        #                     [ScalarStats('cost'), ClassificationError()]),
        #     ScheduledHyperParamSetter('learning_rate',
        #                               [(1, 0.1), (args.drop_1, 0.01),
        #                                (args.drop_2, 0.001), (args.drop_2, 0.0001)]),
        # ],
        callbacks=callbacks,
        model=Model(args.k, args.path, args.block1, args.block2,
                    args.block3, args.block4),
        steps_per_epoch=steps_per_epoch,
        max_epoch=args.max_epoch,
        # session_config=config,
        nr_tower=nr_tower,
    )
def fit(self, data):
    """Fit the model to the given data.

    Args:
        data (pandas.DataFrame): dataset to fit the model.

    Returns:
        None
    """
    self.preprocessor = Preprocessor(continuous_columns=self.continuous_columns)
    data = self.preprocessor.fit_transform(data)
    self.metadata = self.preprocessor.metadata
    dataflow = TGANDataFlow(data, self.metadata)
    batch_data = BatchData(dataflow, self.batch_size)
    input_queue = QueueInput(batch_data)

    self.model = self.get_model(training=True)

    if self.trainer == 'GANTrainer':
        trainer = GANTrainer(model=self.model, input_queue=input_queue)
    elif self.trainer == 'SeparateGANTrainer':
        trainer = SeparateGANTrainer(model=self.model, input_queue=input_queue)
    else:
        raise ValueError(
            'Incorrect trainer name. Use GANTrainer or SeparateGANTrainer')
    # trainer = SeparateGANTrainer(model=self.model, input_queue=input_queue)

    self.restore_path = os.path.join(self.model_dir, 'checkpoint')

    if os.path.isfile(self.restore_path) and self.restore_session:
        session_init = SaverRestore(self.restore_path)
        with open(os.path.join(self.log_dir, 'stats.json')) as f:
            starting_epoch = json.load(f)[-1]['epoch_num'] + 1
    else:
        session_init = None
        starting_epoch = 1

    action = 'k' if self.restore_session else None
    logger.set_logger_dir(self.log_dir, action=action)

    callbacks = []
    if self.save_checkpoints:
        callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

    trainer.train_with_defaults(
        callbacks=callbacks,
        steps_per_epoch=self.steps_per_epoch,
        max_epoch=self.max_epoch,
        session_init=session_init,
        starting_epoch=starting_epoch)

    self.prepare_sampling()
def _init_model(self):
    logger.set_logger_dir("/tmp/test_log/", 'd')
    from dataset import DetectionDataset
    from train import ResNetFPNTrackModel

    # init tensorpack model
    cfg.freeze(False)
    model = ResNetFPNTrackModel()
    DetectionDataset()  # initialize the config with information from our dataset
    finalize_configs(is_training=False)
    return model
def run_once(self, opt, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(opt['train_batch_size'], mode='train')
    valid_datagen = self.get_datagen(opt['infer_batch_size'], mode='valid')

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    ######
    model_flags = opt['model_flags']
    model = self.get_model()(**model_flags)

    ######
    callbacks = [
        ModelSaver(max_to_keep=1, keep_checkpoint_every_n_hours=None),
    ]
    for param_name, param_info in opt['manual_parameters'].items():
        model.add_manual_variable(param_name, param_info[0])
        callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))

    # multi-GPU inference (with mandatory queue prefetch)
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus))))

    if self.model_mode == 'seg_gland':
        callbacks.append(MaxSaver('valid_dice_obj'))
    elif self.model_mode == 'seg_nuc':
        callbacks.append(MaxSaver('valid_dice_np'))
    else:
        callbacks.append(MaxSaver('valid_auc'))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    config = TrainConfig(
        model=model,
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=opt['nr_epochs'],
    )
    config.session_init = sess_init

    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    return
def __init__(self, name, need_network=True, need_img=True, model="best"):
    super().__init__(name=name, is_deterministic=True)
    self._resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE,
                                 cfg.PREPROC.MAX_SIZE)
    self._prev_box = None
    self._ff_gt_feats = None
    self._need_network = need_network
    self._need_img = need_img
    self._rotated_bbox = None

    if need_network:
        logger.set_logger_dir(
            "/tmp/test_log_/" + str(random.randint(0, 10000)), 'd')
        if model == "best":
            load = "train_log/hard_mining3/model-1360500"
        elif model == "nohardexamples":
            load = "train_log/condrcnn_all_2gpu_lrreduce2/model-1200500"
        elif model == "newrpn":
            load = "train_log/newrpn1/model"
        elif model == "resnet50_nohardexamples":
            load = "train_log/condrcnn_all_resnet50/model-1200500"
            cfg.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 6, 3]
        elif model == "resnet50":
            load = "train_log/hard_mining3_resnet50/model-1360500"
            cfg.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 6, 3]
        elif model == "gotonly":
            load = "train_log/hard_mining3_onlygot/model-1361000"
        elif model.startswith("checkpoint:"):
            load = model.replace("checkpoint:", "")
        else:
            assert False, ("unknown model", model)

        from dataset import DetectionDataset
        # init tensorpack model
        # cfg.freeze(False)
        DetectionDataset()  # initialize the config with information from our dataset
        cfg.EXTRACT_GT_FEATURES = True
        cfg.MODE_TRACK = False
        extract_model = ResNetFPNModel()
        extract_ff_feats_cfg = PredictConfig(
            model=extract_model,
            session_init=get_model_loader(load),
            input_names=['image', 'roi_boxes'],
            output_names=['rpn/feature'])
        finalize_configs(is_training=False)
        self._extract_func = OfflinePredictor(extract_ff_feats_cfg)
        cfg.EXTRACT_GT_FEATURES = False
        cfg.MODE_TRACK = True
        cfg.USE_PRECOMPUTED_REF_FEATURES = True
        self._pred_func = self._make_pred_func(load)
def run_once(self, opt, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(opt["train_batch_size"], mode="train")
    valid_datagen = self.get_datagen(opt["infer_batch_size"], mode="valid")

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    ######
    model_flags = opt["model_flags"]
    model = self.get_model()(**model_flags)

    ######
    callbacks = [
        # ModelSaver(max_to_keep=20),  # TODO: make this dynamic
        ModelSaver(max_to_keep=opt["nr_epochs"]),
        # InjectShell(file='/tools/hover_net/src/config.yml', shell='ipython'),
    ]
    for param_name, param_info in opt["manual_parameters"].items():
        model.add_manual_variable(param_name, param_info[0])
        callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))

    # multi-GPU inference (with mandatory queue prefetch)
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus)))
    )
    callbacks.append(MaxSaver("valid_dice"))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    config = TrainConfig(
        model=model,
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=opt["nr_epochs"],
    )
    config.session_init = sess_init

    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    # TODO: save
    return
def run(model):
    instance = Model(model, model.conf.data_format)
    if not model.conf.is_train:
        batch = 64
        dataset = get_data(model.conf.data_dir, 'val', batch)
        eval_on_ILSVRC12(
            instance,
            get_model_loader(model.conf.logdir + '/' + model.conf.test_step),
            dataset)
    else:
        logger.set_logger_dir(os.path.join(model.conf.logdir))
        config = get_config(instance, model.conf)
        if model.conf.reload_step:
            config.session_init = get_model_loader(
                model.conf.logdir + '/' + model.conf.reload_step)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
def run_once(self, nr_gpus, freeze, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(mode='train')
    valid_datagen = self.get_datagen(mode='valid')

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    callbacks = [
        ModelSaver(max_to_keep=200),
        ScheduledHyperParamSetter('learning_rate', self.lr_sched),
    ]

    ######
    # multi-GPU inference (with mandatory queue prefetch)
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus))))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    MODEL_MAKER = Model_NP_XY if self.model_mode == 'np+xy' else Model_NP_DIST

    config = TrainConfig(
        model=MODEL_MAKER(freeze),
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=self.nr_epochs,
    )
    config.session_init = sess_init

    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    return
model = Model()

if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
elif args.flops:
    # manually build the graph with batch=1
    input_desc = [
        InputDesc(tf.float32, [1, 224, 224, 3], 'input'),
        InputDesc(tf.int32, [1], 'label')
    ]
    input = PlaceholderInput()
    input.setup(input_desc)
    with TowerContext('', is_training=True):
        model.build_graph(*input.get_input_tensors())
    tf.profiler.profile(
        tf.get_default_graph(),
        cmd='op',
        options=tf.profiler.ProfileOptionBuilder.float_operation())
else:
    logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))
    nr_tower = max(get_nr_gpu(), 1)
    config = get_config(model, nr_tower)
    if args.load:
        config.session_init = get_model_loader(args.load)
    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(nr_tower))
                    help='variants of resnet to use', default='resnet')
parser.add_argument('--lp', choices=['2', 'inf'])
args = parser.parse_args()

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
if args.mode == 'se':
    assert args.depth >= 50

nr_tower = max(get_nr_gpu(), 1)
batch_size = TOTAL_BATCH_SIZE // nr_tower

model = Model(args.image_size, args.depth, args.data_format, args.mode)
if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_data('val', batch, args.image_size)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
else:
    logger.set_logger_dir(args.checkpoint_dir)
    config = get_config(model, args.checkpoint_dir, args.image_size,
                        fake=args.fake)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)
model = Model()

if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
elif args.flops:
    # manually build the graph with batch=1
    input_desc = [
        InputDesc(tf.float32, [1, 224, 224, 3], 'input'),
        InputDesc(tf.int32, [1], 'label')
    ]
    input = PlaceholderInput()
    input.setup(input_desc)
    with TowerContext('', is_training=True):
        model.build_graph(*input.get_input_tensors())
    tf.profiler.profile(
        tf.get_default_graph(),
        cmd='op',
        options=tf.profiler.ProfileOptionBuilder.float_operation())
else:
    logger.set_logger_dir(
        os.path.join('train_log', 'shufflenet'))
    nr_tower = max(get_nr_gpu(), 1)
    config = get_config(model, nr_tower)
    if args.load:
        config.session_init = get_model_loader(args.load)
    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(nr_tower))
                    type=int, default=18, choices=[18, 34, 50, 101, 152])
parser.add_argument('--eval', action='store_true')
parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
                    help='variants of resnet to use', default='resnet')
args = parser.parse_args()

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
if args.mode == 'se':
    assert args.depth >= 50

model = Model(args.depth, args.data_format, args.mode)
if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
else:
    logger.set_logger_dir(
        os.path.join('train_log', 'imagenet-resnet-d' + str(args.depth)))
    config = get_config(model, fake=args.fake)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)
                   viz=False,
                   saveGif=args.saveGif,
                   saveVideo=args.saveVideo,
                   task='play'),
        pred, num_validation_files)
# run episodes in parallel and evaluate pretrained model
elif args.task == 'eval':
    play_n_episodes(
        get_player(directory=data_dir,
                   files_list=eval_list,
                   viz=False,
                   saveGif=args.saveGif,
                   saveVideo=args.saveVideo,
                   task='eval'),
        pred, num_files)
else:  # train model
    logger.set_logger_dir(logger_dir)
    config = get_config()
    if args.load:  # resume training from a saved checkpoint
        config.session_init = get_model_loader(args.load)
    launch_train_with_config(config, SimpleTrainer())

    # # FOR PROFILING
    # NUM_EPOCHS = 2
    # import cProfile
    # import pstats
    # profiler = cProfile.Profile()
    # profiler.runctx('launch_train_with_config(config, SimpleTrainer())',
    #                 globals(), locals())
    #
    # stats = pstats.Stats(profiler)
    # stats.strip_dirs()
    )
parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
                    help='variants of resnet to use', default='resnet')
parser.add_argument('--log_dir', type=str, default='')
args = parser.parse_args()

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

model = Model(args.depth, args.mode)
model.data_format = args.data_format
if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
else:
    if args.fake:
        logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
    else:
        log_folder = '/data0/wangguangrun/log_acnt/imagenet-resnet-%s' % (
            args.log_dir)
        logger.set_logger_dir(os.path.join(log_folder))
    config = get_config(model, fake=args.fake)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)
    eval_on_AVA2012(model, get_model_loader(args.load), ds, args.repeat_times)
else:
    # add @ 20171128: the strategy of parameter initialization with an ImageNet pre-trained
    # model should be recorded within the name string of the training-log directory
    initial_strategy = '_fromScratch'
    if args.load:
        initial_strategy = '_preTrainedModel'
    elif args.load_npy:
        initial_strategy = '_preTrainedImageNetModel'

    # change @ 20180705: introduce An when AESTHETIC_LEVEL is set to n
    logger.set_logger_dir('./train_log/AVA2012{6}-{0}-d{1}-{2}-{3}{4}{5}_LRT3'.format(
        args.mode, args.depth,
        args.crop_method_TR, args.crop_method_TS, initial_strategy,
        '' if args.JensenFactor == 0.0 else '_JE{}'.format(args.JensenFactor),
        '' if args.aesthetic_level == AESTHETIC_LEVEL else '-A{}'.format(args.aesthetic_level)))

    config = get_config(model, args.data, args.crop_method_TR,
                        args.color_augmentation, args.crop_method_TS)

    # load pre-trained model if it exists
    # TODO: layer-cascade or freeze-layer? relay-backpropagation?
    # layer-wise adaptive scale rate?
    if args.load:
        print('--> initialize the session with the checkpoint file %s' % args.load)
        config.session_init = get_model_loader(args.load)
    elif args.load_npy:
        print('--> initialize the session with the npy file %s' % args.load)
        # add @ 20171128: adopt the ImageNet pre-trained model for initialization purposes
        # load params from the npy file, convert them into the desired format,
parser.add_argument('--data_format', help='specify NCHW or NHWC',
                    type=str, default='NHWC')
parser.add_argument('--eval', action='store_true')
parser.add_argument(
    '--batch_size_per_gpu', default=32, type=int,
    help='total batch size. 32 per GPU gives best accuracy, '
         'higher values should be similarly good')
args = parser.parse_args()

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

model = Model(args.data_format)
if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
else:
    logger.set_logger_dir(os.path.join('train_log', 'vgg'))
    config = get_config(model)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)
                         screen_dims=IMAGE_SIZE,
                         spacing=SPACING)
NUM_ACTIONS = init_player.action_space.n
num_validation_files = init_player.files.num_files

if args.task != 'train':
    assert args.load is not None
    pred = OfflinePredictor(PredictConfig(
        model=Model(),
        session_init=get_model_loader(args.load),
        input_names=['state'],
        output_names=['Qvalue']))
    if args.task == 'play':
        t0 = time.time()
        play_n_episodes(get_player(directory=data_dir,
                                   files_list=test_list,
                                   viz=0.01,
                                   saveGif=args.saveGif,
                                   saveVideo=args.saveVideo),
                        pred, num_validation_files)
        t1 = time.time()
        print(t1 - t0)
    elif args.task == 'eval':
        eval_model_multithread(pred, EVAL_EPISODE, get_player)
else:
    logger.set_logger_dir(logger_dir)  # TODO: variable log dir
    config = get_config()
    if args.load:
        config.session_init = get_model_loader(args.load)
    launch_train_with_config(config, SimpleTrainer())
                    type=float, default=0.1)
parser.add_argument('--log_path', help='path of log', type=str, default='')
parser.add_argument('--action', help='action type', type=str, default='')
args = parser.parse_args()

TOTAL_BATCH_SIZE = args.batch_size
imagenet_utils.DEFAULT_IMAGE_SHAPE = args.input_size

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

model = Model(args.depth, args.data_format, args.mode, args.wd,
              args.qw, args.qa, learning_rate=args.lr, data_aug=args.data_aug)
if args.eval:
    batch = 100  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
else:
    if args.log_path == '':
        logger.set_logger_dir(
            os.path.join('train_log',
                         'imagenet_resnet_d' + str(args.depth) + args.logdir_id),
            action=None if args.action == '' else args.action)
    else:
        logger.set_logger_dir(
            args.log_path + '/train_log/' + args.logdir_id,
            action=None if args.action == '' else args.action)
    config = get_config(model, fake=args.fake, data_aug=args.data_aug)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)
                    action='store_true')
parser.add_argument('--data_format', help='specify NCHW or NHWC',
                    type=str, default='NCHW')
parser.add_argument('-d', '--depth', help='resnet depth',
                    type=int, default=18, choices=[18, 34, 50, 101])
parser.add_argument('--eval', action='store_true')
args = parser.parse_args()

DEPTH = args.depth
if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

if args.eval:
    BATCH_SIZE = 128  # something that can run on one gpu
    ds = get_data('val')
    eval_on_ILSVRC12(Model(), args.load, ds)
    sys.exit()

logger.set_logger_dir(
    os.path.join('train_log', 'imagenet-resnet-d' + str(DEPTH)))
config = get_config(fake=args.fake, data_format=args.data_format)
if args.load:
    config.session_init = SaverRestore(args.load)
SyncMultiGPUTrainerParameterServer(config).train()
parser = argparse.ArgumentParser()
parser.add_argument('--logdir', help='logdir', default='')
args = parser.parse_args()

# P_py = np.load('/jasper/models/gp140/P_py.npy')
Ppy = np.load('/jasper/models/BetaGal/betagal1.5_projections.npy')
Ppy = Ppy[0]  # leave only first symmetric unit
vlen, nviews = Ppy.shape[-1], Ppy.shape[0]

os.environ['CUDA_VISIBLE_DEVICES'] = get_visible_device_list(3)
global_step = get_global_step_var()

# set logger directory for checkpoints, etc.
logger.set_logger_dir(args.logdir, action='k')

steps_per_epoch = cfg.EPOCH_STEPS
model = Model(vlen, nviews)

# config.gpu_options.allow_growth = True
traincfg = TrainConfig(
    model=model,
    data=QueueInput(ProjDataFlow(Ppy)),
    callbacks=[
        PeriodicTrigger(ModelSaver(), every_k_epochs=5),
        PeriodicTrigger(VolumeSaver(model), every_k_epochs=5),
        # prevent learning in the first epoch
        # MemInitHyperParamSetter('learning_rate_mask', (0, 1)),
        # controls learning rate as a function of epoch
        HyperParamSetterWithFunc('learning_rate', learning_rate_fun),
        # GraphProfiler()
                    help='systolic array width', type=int, default=256)
parser.add_argument('--accumulator-array-height',
                    help='accumulator array height', type=int, default=4096)
parser.add_argument('--tensorpack-logdir-id',
                    help='TensorPack training log directory id',
                    type=str, default='')
parser.add_argument('--mpusim-logdir',
                    help='MPU simulator log directory', type=str, default='.')
args = parser.parse_args()

model = Model(args.resnet_depth,
              args.activations_datatype_size_byte,
              args.weights_datatype_size_byte,
              args.results_datatype_size_byte,
              args.systolic_array_height,
              args.systolic_array_width,
              args.accumulator_array_height,
              args.mpusim_logdir)

logger.set_logger_dir(
    os.path.join(
        'train_log',
        'resnext_{}{}'.format(args.resnet_depth, args.tensorpack_logdir_id)))

config = get_config(model)
launch_train_with_config(config, SimpleTrainer())
scalar_steps = args.get("scalar_steps")
if scalar_steps > 0:
    scalar_steps = max(scalar_steps // equi_batch_size, 1)
else:
    scalar_steps = 0  # merge scalar summary every epoch
# lr starts decreasing at half of max epoch
start_dec_epoch = max_epoch // 2
# stops when lr is 0.01 of its initial value
end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
# adjust noise input range according to the input act
zmin, zmax = (0, 1) if args.get("act") == "identity" else (-1, 1)

if save_folder is None:
    logger.auto_set_dir()
else:
    logger.set_logger_dir(save_folder)

df = get_data(data_folder, image_size, zmin=zmin, zmax=zmax)
df = PrintData(df)
data = QueueInput(df)

SynTexTrainer(data, AdaptiveSynTex(args), n_gpu).train_with_defaults(
    callbacks=[
        PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
        PeriodicTrigger(ModelSaver(), every_k_epochs=end_epoch),  # save model at last
        ScheduledHyperParamSetter('learning_rate',
                                  [(start_dec_epoch, lr), (max_epoch, 0)],
                                  interp="linear"),
        # PeriodicTrigger(VisualizeTestSet(data_folder, image_size), every_k_epochs=10),
        MergeAllSummaries(period=scalar_steps),  # scalar only
"Note that it's best to keep per-GPU batch size in [32, 64] to obtain the best accuracy." "Pretrained models listed in README were trained with batch=32x8.") args = parser.parse_args() if args.gpu: os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu bit_actn, bit_weight = None, None if args.quant: bit_actn, bit_weight = args.quant_bit_actn, args.quant_bit_weight model = Model(args.use_fp16, bit_actn, bit_weight) model.data_format = args.data_format if args.eval: batch = 128 # something that can run on one gpu ds = get_data('val', batch) eval_on_ILSVRC12(model, get_model_loader(args.load), ds) else: if args.fake: logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd') else: logger.set_logger_dir( os.path.join('train_log', 'imagenet-darknet-batch{}'.format(args.batch))) config = get_config(model, fake=args.fake) if args.load: config.session_init = get_model_loader(args.load) trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1)) launch_train_with_config(config, trainer)
parser.add_argument("--save-dir")
parser.add_argument("--lr", type=float, default=0.1)
parser.add_argument("--mult-decay", type=float, default=MULT_DECAY)
args = parser.parse_args()

NUM_UNITS = args.num_units
mult_decay = args.mult_decay
lr_base = args.lr
save_dir = args.save_dir

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

if save_dir is None:
    logger.auto_set_dir()
else:
    logger.set_logger_dir(save_dir)

dataset_train = get_data('train')
dataset_test = get_data('test')

config = TrainConfig(
    model=CifarResNet(n=NUM_UNITS, mult_decay=mult_decay, lr_init=lr_base * 0.1),
    dataflow=dataset_train,
    callbacks=[
        ModelSaver(),
        InferenceRunner(
            dataset_test,
            [ScalarStats('cost'), ClassificationError('wrong_vector')]),
            del d[key]
    if args.fix_mean_var:
        eval_checkpoint = eval_checkpoint.replace('%03d' % original_scale,
                                                  '%03d' % scale)
        d_ = tfutils.varmanip.load_chkpt_vars(eval_checkpoint)
        for key in d.keys():
            if 'mean' in key or 'variance' in key:
                d[key] = d_[key]
    sessinit = tfutils.sessinit.DictRestore(d)
    batch = 100  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, scale, sessinit, ds)
else:
    distill = '-distill' if args.kd else ''
    fixed_qa = '-fixed_qa' if args.fixed_qa else ''
    note = '-%s' % args.note if args.note is not None else ''
    note = distill + fixed_qa + note
    logger_name = '%s%d-%d-%d-%s%s' \
        % (args.mode, args.depth, args.qw, args.qa,
           args.scales.replace(',', '_'), note)
    logger_dir = os.path.join('train_log', logger_name + args.logdir_id)
    logger.set_logger_dir(logger_dir, action=args.action)
    config = get_config(model, scales, distill=args.kd,
                        fake=args.fake, data_aug=True)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)
model = Model(args.depth, args.data_format, args.mode, args.wd,
              args.qw, args.qa, learning_rate=args.lr, data_aug=args.data_aug)
if args.eval:
    batch = 100  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
else:
    if args.log_path == '':
        logger.set_logger_dir(
            os.path.join(
                'train_log',
                'imagenet_resnet_d' + str(args.depth) + args.logdir_id),
            action=None if args.action == '' else args.action)
    else:
        logger.set_logger_dir(
            args.log_path + '/train_log/' + args.logdir_id,
            action=None if args.action == '' else args.action)
    config = get_config(model, fake=args.fake, data_aug=args.data_aug)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)
                    choices=[50, 101])
parser.add_argument('--logdir', default='train_log/ResNet-GN')
parser.add_argument('--WS', action='store_true',
                    help='Use Weight Standardization')
args = parser.parse_args()

model = Model()
model.depth = args.depth
model.use_WS = args.WS
if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_imagenet_dataflow(args.data, 'val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
else:
    if args.fake:
        logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
    else:
        logger.set_logger_dir(args.logdir, 'd')

    try:
        from tensorpack.tfutils import collect_env_info
        logger.info("\n" + collect_env_info())
    except Exception:
        pass

    config = get_config(model, fake=args.fake)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
    launch_train_with_config(config, trainer)
                    type=str, default='NCHW')
parser.add_argument('-d', '--depth', help='resnet depth',
                    type=int, default=18, choices=[18, 34, 50, 101, 152])
parser.add_argument('--eval', action='store_true')
parser.add_argument('--batch', default=256, type=int,
                    help='total batch size. 32 per GPU gives best accuracy, '
                         'higher values should be similarly good')
parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
                    help='variants of resnet to use', default='resnet')
args = parser.parse_args()

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

model = Model(args.depth, args.data_format, args.mode)
if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
else:
    if args.fake:
        logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
    else:
        logger.set_logger_dir(
            os.path.join('train_log',
                         'imagenet-{}-d{}'.format(args.mode, args.depth)))
    config = get_config(model, fake=args.fake)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--data', default="/data_a/dataset/imagenet2012",
                        help='ILSVRC dataset dir')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--fake', help='use fakedata to test or benchmark this model',
                        action='store_true')
    parser.add_argument('--data_format', help='specify NCHW or NHWC',
                        type=str, default='NHWC')
    parser.add_argument('--eval', action='store_true')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.data_format)
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        logger.set_logger_dir(
            os.path.join('train_log', 'imagenet-vgg'))
        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
"Note that it's best to keep per-GPU batch size in [32, 64] to obtain the best accuracy." "Pretrained models listed in README were trained with batch=32x8.") parser.add_argument('--mode', choices=['resnet', 'preact', 'se'], help='variants of resnet to use', default='resnet') args = parser.parse_args() if args.gpu: os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu model = Model(args.depth, args.mode) model.data_format = args.data_format if args.eval: batch = 128 # something that can run on one gpu ds = get_data('val', batch) eval_on_ILSVRC12(model, get_model_loader(args.load), ds) else: if args.fake: logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd') else: logger.set_logger_dir( os.path.join('train_log', 'imagenet-{}-d{}'.format(args.mode, args.depth))) config = get_config(model, fake=args.fake) if args.load: config.session_init = get_model_loader(args.load) trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1)) launch_train_with_config(config, trainer)
                    help='variants of resnet to use', default='resnet')
args = parser.parse_args()

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

model = Model(args.depth, args.mode)
model.data_format = args.data_format
if args.weight_decay_norm:
    model.weight_decay_pattern = ".*/W|.*/gamma|.*/beta"
if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_imagenet_dataflow(args.data, 'val', batch)
    eval_classification(model, SmartInit(args.load), ds)
else:
    if args.fake:
        logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
    else:
        logger.set_logger_dir(
            os.path.join(
                '/data0/wangguangrun/tensorflow_log/train_log',
                'imagenet-{}-d{}-batch{}'.format(args.mode, args.depth, args.batch)))

    config = get_config(model)
    config.session_init = SmartInit(args.load)
    trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
    launch_train_with_config(config, trainer)
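The examples above share one pattern: pick a log directory, call `logger.set_logger_dir` (or `logger.auto_set_dir()`) once, and only then construct callbacks such as `ModelSaver`, which default their checkpoint directory to the logger directory. A minimal sketch of that pattern follows; the directory name and the choice of `action` are illustrative assumptions, not taken from any single example above.

# Minimal sketch of the common pattern (assumed paths/names, not from any one repo).
import os

from tensorpack.utils import logger
from tensorpack.callbacks import ModelSaver

log_dir = os.path.join('train_log', 'my_experiment')  # hypothetical directory
# action='k' keeps an existing directory (e.g. when resuming); 'd' deletes it first,
# as seen with the 'd' and 'k' flags in the snippets above.
logger.set_logger_dir(log_dir, action='k')

# ModelSaver defaults its checkpoint_dir to the logger directory,
# which is why set_logger_dir must run before the callbacks are built.
callbacks = [ModelSaver()]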