def load_params(self, args):
        """
        Load arguments
        """

        # Tile Size
        self.tile_size = int(args['--tile_size'])
        # Paths
        self.model_path = args['--model']
        # get absolute path for input directory - otherwise may give error in JP2Image.m
        self.input_dir = os.path.abspath(args['--input_dir'])
        self.output_dir = args['--output_dir']
        rm_n_mkdir(self.output_dir)
        self.logging_dir = args['--logging_dir']
        logging_dir = os.path.join(self.output_dir, self.logging_dir)
        rm_n_mkdir(logging_dir)
        logger.set_logger_dir(logging_dir)

        self.logging_level = args['--logging_level']
        #TODO: this depends on tensorflow getting first crack at the logger (and adding the default std_out handler with INFO-level logging)
        logger._logger.handlers[0].setLevel(self.logging_level)
        logger._logger.setLevel(self.logging_level)

        # Processing
        self.batch_size = int(args['--batch_size'])
        # Below specific to WSI processing
        self.return_masks = args['--return_masks']

        self.tiss_lvl = 3  # default WSI level at which to perform tissue segmentation
        print(f"'--tissue_level' provided: {args['--tissue_level']}")
        try:
            if args['--tissue_level'] and int(args['--tissue_level']) > 3:
                self.tiss_lvl = int(args['--tissue_level'])
        except (KeyError, TypeError, ValueError):
            pass  # keep the default level if the flag is absent or malformed
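
Every example on this page routes its output through tensorpack's global logger, and set_logger_dir has to run before ModelSaver or any other callback that writes into the log directory. A minimal sketch of the shared call (the directory name is hypothetical):

from tensorpack.utils import logger

# action='d' deletes an existing directory, 'k' keeps it (useful when
# resuming), 'b' backs it up; with no action, tensorpack asks interactively.
logger.set_logger_dir("train_log/my_experiment", action="d")
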
Example #2
def train(args):
    data_folder = args.get("data_folder")
    save_folder = args.get("save_folder")
    image_size = args.get("image_size")
    max_epoch = args.get("max_epoch")
    save_epoch = args.get("save_epoch") or max_epoch // 10
    # Scale lr and steps_per_epoch accordingly.
    # Make sure the total number of gradient evaluations is consistent.
    n_gpu = args.get("n_gpu") or 1
    batch_size = args.get("batch_size") or BATCH
    equi_batch_size = max(n_gpu, 1) * batch_size
    lr = args.get("lr") or LR
    lr *= equi_batch_size
    steps_per_epoch = args.get("steps_per_epoch") or 1000
    steps_per_epoch = max(steps_per_epoch // equi_batch_size, 1)
    image_steps = args.get("image_steps") or steps_per_epoch // 10
    scalar_steps = args.get("scalar_steps") or 0
    if scalar_steps > 0:
        scalar_steps = max(scalar_steps // equi_batch_size, 1)
    else:
        scalar_steps = 0  # merge scalar summary every epoch
    # lr starts decreasing at half of max epoch
    start_dec_epoch = max_epoch // 2
    # stops when lr is 0.01 of its initial value
    end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
    # adjust noise input range according to the input act
    zmin, zmax = (0, 1) if args.get("act_input") == "identity" else (-1, 1)

    if save_folder is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_folder)

    df = get_data(data_folder,
                  image_size,
                  zmin=zmin,
                  zmax=zmax,
                  batch=batch_size)
    df = PrintData(df)
    data = QueueInput(df)

    SynTexTrainer(data, Style2PO(args), n_gpu).train_with_defaults(
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
            PeriodicTrigger(ModelSaver(),
                            every_k_epochs=end_epoch),  # save model at last
            ScheduledHyperParamSetter('learning_rate', [(start_dec_epoch, lr),
                                                        (max_epoch, 0)],
                                      interp="linear"),
            PeriodicTrigger(VisualizeTestSet(data_folder, image_size),
                            every_k_epochs=max(1, max_epoch // 100)),
            #MergeAllSummaries(period=scalar_steps), # scalar only, slowdown in training, use TCMalloc
            MergeAllSummaries(period=image_steps, key="image_summaries"),
            MergeAllSummaries(key="acti_summaries"),
        ],
        max_epoch=end_epoch,
        steps_per_epoch=steps_per_epoch,
        session_init=None)
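
The scaling block at the top of train() keeps the total number of gradient evaluations constant as hardware changes: the learning rate grows linearly with the effective batch, while steps per epoch shrink by the same factor. A worked sketch with illustrative values (LR and BATCH are module constants not shown in the snippet):

n_gpu, batch_size = 2, 16
equi_batch_size = max(n_gpu, 1) * batch_size        # 32 samples per step
lr = 1e-4 * equi_batch_size                         # linear scaling -> 3.2e-3
steps_per_epoch = max(1000 // equi_batch_size, 1)   # 31 steps per epoch
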
Example #3
def get_config():
    log_dir = 'train_log/cifar10-bc-k[%d]-path[%d]-[%d-%d-%d-%d]-' % (
        int(args.k), int(args.path), int(args.block1),
        int(args.block2), int(args.block3), int(args.block4))
    logger.set_logger_dir(log_dir, action='n')

    # prepare dataset
    dataset_train = get_data('train')

    dataset_test = get_data('test')
    #gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
    # config = tf.ConfigProto(allow_soft_placement = True,gpu_options=gpu_options)
    # config.gpu_options.allow_growth=True
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4
    callbacks = []
    callbacks.append(ModelSaver())
    nr_tower = len(args.gpu.split(','))
    print('nr_tower = {}'.format(nr_tower))
    steps_per_epoch = dataset_train.size() // nr_tower
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(
            InferenceRunner(dataset_test,
                            [ScalarStats('cost'),
                             ClassificationError()]))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(
                dataset_test, [ScalarStats('cost'),
                               ClassificationError()], list(range(nr_tower))))
        #callbacks.append(InferenceRunner(dataset_test,
        #[ScalarStats('cost',prefix="testing"), ClassificationError(summary_name='validataion_error1')]))

    # callbacks.append(DataParallelInferenceRunner(
    #             dataset_test, [ScalarStats('cost'), ClassificationError()], list(range(nr_tower))))
    callbacks.append(
        ScheduledHyperParamSetter('learning_rate', [(0, 0.1),
                                                    (args.drop_1, 0.01),
                                                    (args.drop_2, 0.001),
                                                    (args.drop_3, 0.0002)]))
    return TrainConfig(
        dataflow=dataset_train,
        # callbacks=[
        #     ModelSaver(),
        #     InferenceRunner(dataset_test,
        #         [ScalarStats('cost'), ClassificationError()]),
        #     ScheduledHyperParamSetter('learning_rate',
        #                               [(1, 0.1), (args.drop_1, 0.01), (args.drop_2, 0.001),(args.drop_2, 0.0001)])
        # ],
        callbacks=callbacks,
        model=Model(args.k, args.path, args.block1, args.block2, args.block3,
                    args.block4),
        steps_per_epoch=steps_per_epoch,
        max_epoch=args.max_epoch,
        #session_config = config,
        nr_tower=nr_tower,
    )
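
ScheduledHyperParamSetter consumes (epoch, value) pairs and applies each value once training reaches that epoch; with interp="linear" (as in the SynTex example above) it interpolates between the listed points instead of stepping. A minimal sketch with hypothetical drop epochs:

from tensorpack.callbacks import ScheduledHyperParamSetter

# step schedule: 0.1 from the start, dropping at epochs 80, 120 and 160
lr_schedule = ScheduledHyperParamSetter(
    'learning_rate', [(0, 0.1), (80, 0.01), (120, 0.001), (160, 0.0002)])
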
Example #4
    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data(pandas.DataFrame): dataset to fit the model.

        Returns:
            None

        """
        self.preprocessor = Preprocessor(
            continuous_columns=self.continuous_columns)

        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata
        dataflow = TGANDataFlow(data, self.metadata)
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        if self.trainer == 'GANTrainer':
            trainer = GANTrainer(model=self.model, input_queue=input_queue)
        elif self.trainer == 'SeparateGANTrainer':
            trainer = SeparateGANTrainer(model=self.model,
                                         input_queue=input_queue)
        else:
            raise ValueError(
                'Incorrect trainer name. Use GANTrainer or SeparateGANTrainer')

        # trainer = SeparateGANTrainer(model=self.model, input_queue=input_queue)

        self.restore_path = os.path.join(self.model_dir, 'checkpoint')

        if os.path.isfile(self.restore_path) and self.restore_session:
            session_init = SaverRestore(self.restore_path)
            with open(os.path.join(self.log_dir, 'stats.json')) as f:
                starting_epoch = json.load(f)[-1]['epoch_num'] + 1
        else:
            session_init = None
            starting_epoch = 1

        action = 'k' if self.restore_session else None
        logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

        trainer.train_with_defaults(callbacks=callbacks,
                                    steps_per_epoch=self.steps_per_epoch,
                                    max_epoch=self.max_epoch,
                                    session_init=session_init,
                                    starting_epoch=starting_epoch)

        self.prepare_sampling()
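
A hypothetical call to the fit() above, assuming the enclosing class follows TGAN's public TGANModel interface (the CSV path and column indices are invented for illustration):

import pandas as pd
from tgan.model import TGANModel

data = pd.read_csv("census.csv")                  # any tabular dataset
model = TGANModel(continuous_columns=[0, 5, 16])  # indices of numeric columns
model.fit(data)                                   # preprocess, train, prepare sampling
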
Example #5
    def _init_model(self):
        logger.set_logger_dir("/tmp/test_log/", 'd')
        from dataset import DetectionDataset
        from train import ResNetFPNTrackModel
        # init tensorpack model
        cfg.freeze(False)
        model = ResNetFPNTrackModel()
        DetectionDataset()  # initialize the config with information from our dataset
        finalize_configs(is_training=False)
        return model
Example #6
    def run_once(self, opt, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(opt['train_batch_size'], mode='train')
        valid_datagen = self.get_datagen(opt['infer_batch_size'], mode='valid')

        ###### must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        ######
        model_flags = opt['model_flags']
        model = self.get_model()(**model_flags)
        ######
        callbacks = [
            ModelSaver(max_to_keep=1, keep_checkpoint_every_n_hours=None),
        ]

        for param_name, param_info in opt['manual_parameters'].items():
            model.add_manual_variable(param_name, param_info[0])
            callbacks.append(
                ScheduledHyperParamSetter(param_name, param_info[1]))
        # multi-GPU inference (with mandatory queue prefetch)
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs,
                                        list(range(nr_gpus))))
        if self.model_mode == 'seg_gland':
            callbacks.append(MaxSaver('valid_dice_obj'))
        elif self.model_mode == 'seg_nuc':
            callbacks.append(MaxSaver('valid_dice_np'))
        else:
            callbacks.append(MaxSaver('valid_auc'))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        config = TrainConfig(
            model=model,
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=opt['nr_epochs'],
        )
        config.session_init = sess_init

        launch_train_with_config(config,
                                 SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph()  # remove the entire graph in case of multiple runs
        return
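
The steps_per_epoch division mirrors the data-parallel setup: each training step feeds one batch to every GPU, so draining the dataflow once takes size/nr_gpus steps. Illustrative numbers:

dataflow_size = 4000                         # batches yielded per epoch
nr_gpus = 4
steps_per_epoch = dataflow_size // nr_gpus   # 1000 steps consume all 4000 batches
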
Example #7
    def __init__(self, name, need_network=True, need_img=True, model="best"):
        super().__init__(name=name, is_deterministic=True)
        self._resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE,
                                     cfg.PREPROC.MAX_SIZE)
        self._prev_box = None
        self._ff_gt_feats = None
        self._need_network = need_network
        self._need_img = need_img
        self._rotated_bbox = None

        if need_network:
            logger.set_logger_dir(
                "/tmp/test_log_/" + str(random.randint(0, 10000)), 'd')
            if model == "best":
                load = "train_log/hard_mining3/model-1360500"
            elif model == "nohardexamples":
                load = "train_log/condrcnn_all_2gpu_lrreduce2/model-1200500"
            elif model == "newrpn":
                load = "train_log/newrpn1/model"
            elif model == "resnet50_nohardexamples":
                load = "train_log/condrcnn_all_resnet50/model-1200500"
                cfg.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 6, 3]
            elif model == "resnet50":
                load = "train_log/hard_mining3_resnet50/model-1360500"
                cfg.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 6, 3]
            elif model == "gotonly":
                load = "train_log/hard_mining3_onlygot/model-1361000"
            elif model.startswith("checkpoint:"):
                load = model.replace("checkpoint:", "")
            else:
                assert False, ("unknown model", model)
            from dataset import DetectionDataset
            # init tensorpack model
            # cfg.freeze(False)
            DetectionDataset()  # initialize the config with information from our dataset

            cfg.EXTRACT_GT_FEATURES = True
            cfg.MODE_TRACK = False
            extract_model = ResNetFPNModel()
            extract_ff_feats_cfg = PredictConfig(
                model=extract_model,
                session_init=get_model_loader(load),
                input_names=['image', 'roi_boxes'],
                output_names=['rpn/feature'])
            finalize_configs(is_training=False)
            self._extract_func = OfflinePredictor(extract_ff_feats_cfg)

            cfg.EXTRACT_GT_FEATURES = False
            cfg.MODE_TRACK = True
            cfg.USE_PRECOMPUTED_REF_FEATURES = True
            self._pred_func = self._make_pred_func(load)
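
PredictConfig only names the graph's input and output tensors; OfflinePredictor compiles it into a plain callable that takes inputs in input_names order and returns outputs in output_names order. A sketch of calling the feature extractor built above (array shapes and box values are illustrative):

import numpy as np

img = np.zeros((600, 800, 3), dtype=np.float32)               # HWC image
boxes = np.array([[10., 20., 200., 240.]], dtype=np.float32)  # x1, y1, x2, y2
predict = OfflinePredictor(extract_ff_feats_cfg)
feats = predict(img, boxes)[0]   # first (and only) output: 'rpn/feature'
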
Example #8
    def run_once(self, opt, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(opt["train_batch_size"], mode="train")
        valid_datagen = self.get_datagen(opt["infer_batch_size"], mode="valid")

        ###### must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        ######
        model_flags = opt["model_flags"]
        model = self.get_model()(**model_flags)
        ######
        callbacks = [
            # ModelSaver(max_to_keep=20), # TODO dynamic this
            ModelSaver(max_to_keep=opt["nr_epochs"]),
            # InjectShell(file='/tools/hover_net/src/config.yml', shell='ipython'),
        ]

        for param_name, param_info in opt["manual_parameters"].items():
            model.add_manual_variable(param_name, param_info[0])
            callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))
        # multi-GPU inference (with mandatory queue prefetch)
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus)))
        )
        callbacks.append(MaxSaver("valid_dice"))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        config = TrainConfig(
            model=model,
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=opt["nr_epochs"],
        )
        config.session_init = sess_init

        launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph()  # remove the entire graph in case of multiple runs
        # TODO: save
        return
Example #9
def run(model):
    instance = Model(model, model.conf.data_format)
    if not model.conf.is_train:
        batch = 64
        dataset = get_data(model.conf.data_dir, 'val', batch)
        eval_on_ILSVRC12(
            instance,
            get_model_loader(model.conf.logdir + '/' + model.conf.test_step),
            dataset)
    else:
        logger.set_logger_dir(model.conf.logdir)
        config = get_config(instance, model.conf)
        if model.conf.reload_step:
            config.session_init = get_model_loader(model.conf.logdir + '/' +
                                                   model.conf.reload_step)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #10
    def run_once(self, nr_gpus, freeze, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(mode='train')
        valid_datagen = self.get_datagen(mode='valid')

        ###### must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        callbacks = [
            ModelSaver(max_to_keep=200),
            ScheduledHyperParamSetter('learning_rate', self.lr_sched),
        ]
        ######

        # multi-GPU inference (with mandatory queue prefetch)
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs,
                                        list(range(nr_gpus))))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        MODEL_MAKER = Model_NP_XY if self.model_mode == 'np+xy' else Model_NP_DIST

        config = TrainConfig(
            model=MODEL_MAKER(freeze),
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=self.nr_epochs,
        )
        config.session_init = sess_init

        launch_train_with_config(config,
                                 SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph()  # remove the entire graph in case of multiple runs
        return
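
The trailing tf.reset_default_graph() is what makes repeated calls safe: tensorpack builds into the default graph, so without it a second run_once would collide with stale variables. A sketch of the loop this enables inside the class (hyperparameter values hypothetical):

for lr in [1e-3, 1e-4]:            # simple learning-rate sweep
    self.lr_sched = [(0, lr)]
    self.run_once(nr_gpus=1, freeze=False, save_dir='train_log/lr_%g' % lr)
    # each call ends by resetting the graph, so runs do not leak into each other
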
Example #11
    model = Model()

    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    elif args.flops:
        # manually build the graph with batch=1
        input_desc = [
            InputDesc(tf.float32, [1, 224, 224, 3], 'input'),
            InputDesc(tf.int32, [1], 'label')
        ]
        input = PlaceholderInput()
        input.setup(input_desc)
        with TowerContext('', is_training=True):
            model.build_graph(*input.get_input_tensors())

        tf.profiler.profile(
            tf.get_default_graph(),
            cmd='op',
            options=tf.profiler.ProfileOptionBuilder.float_operation())
    else:
        logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))

        nr_tower = max(get_nr_gpu(), 1)
        config = get_config(model, nr_tower)
        if args.load:
            config.session_init = get_model_loader(args.load)
        launch_train_with_config(config,
                                 SyncMultiGPUTrainerParameterServer(nr_tower))
Example #12
                        help='variants of resnet to use',
                        default='resnet')
    parser.add_argument('--lp', choices=['2', 'inf'])

    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.mode == 'se':
        assert args.depth >= 50

    nr_tower = max(get_nr_gpu(), 1)
    batch_size = TOTAL_BATCH_SIZE // nr_tower

    model = Model(args.image_size, args.depth, args.data_format, args.mode)
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_data('val', batch, args.image_size)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        logger.set_logger_dir(args.checkpoint_dir)
        config = get_config(model,
                            args.checkpoint_dir,
                            args.image_size,
                            fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #13
    model = Model()

    if args.eval:
        batch = 128    # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    elif args.flops:
        # manually build the graph with batch=1
        input_desc = [
            InputDesc(tf.float32, [1, 224, 224, 3], 'input'),
            InputDesc(tf.int32, [1], 'label')
        ]
        input = PlaceholderInput()
        input.setup(input_desc)
        with TowerContext('', is_training=True):
            model.build_graph(*input.get_input_tensors())

        tf.profiler.profile(
            tf.get_default_graph(),
            cmd='op',
            options=tf.profiler.ProfileOptionBuilder.float_operation())
    else:
        logger.set_logger_dir(
            os.path.join('train_log', 'shufflenet'))

        nr_tower = max(get_nr_gpu(), 1)
        config = get_config(model, nr_tower)
        if args.load:
            config.session_init = get_model_loader(args.load)
        launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_tower))
Example #14
                        type=int,
                        default=18,
                        choices=[18, 34, 50, 101, 152])
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--mode',
                        choices=['resnet', 'preact', 'se'],
                        help='variants of resnet to use',
                        default='resnet')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.mode == 'se':
        assert args.depth >= 50

    model = Model(args.depth, args.data_format, args.mode)
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        logger.set_logger_dir(
            os.path.join('train_log', 'imagenet-resnet-d' + str(args.depth)))

        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #15
                           viz=False,
                           saveGif=args.saveGif,
                           saveVideo=args.saveVideo,
                           task='play'), pred, num_validation_files)
        # run episodes in parallel and evaluate pretrained model
        elif args.task == 'eval':
            play_n_episodes(
                get_player(directory=data_dir,
                           files_list=eval_list,
                           viz=False,
                           saveGif=args.saveGif,
                           saveVideo=args.saveVideo,
                           task='eval'), pred, num_files)
    else:  # train model

        logger.set_logger_dir(logger_dir)
        config = get_config()
        if args.load:  # resume training from a saved checkpoint
            config.session_init = get_model_loader(args.load)

        launch_train_with_config(config, SimpleTrainer())

        # # FOR PROFILING
        # NUM_EPOCHS = 2
        # import cProfile
        # import pstats
        # profiler = cProfile.Profile()
        # profiler.runctx('launch_train_with_config(config, SimpleTrainer())', globals(), locals())
        #
        # stats = pstats.Stats(profiler)
        # stats.strip_dirs()
Example #16
    )
    parser.add_argument('--mode',
                        choices=['resnet', 'preact', 'se'],
                        help='variants of resnet to use',
                        default='resnet')
    parser.add_argument('--log_dir', type=str, default='')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.mode)
    model.data_format = args.data_format
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            log_folder = '/data0/wangguangrun/log_acnt/imagenet-resnet-%s' % (
                args.log_dir)
            logger.set_logger_dir(log_folder)

        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #17
        eval_on_AVA2012(model, get_model_loader(args.load), ds, args.repeat_times)

    else:
        # add @ 20171128: the strategy of parameter initialization with an ImageNet pre-trained model
        #                 should be recorded within the name string of the directory of training log
        initial_strategy = '_fromScratch'
        if args.load:
            initial_strategy = '_preTrainedModel'
        elif args.load_npy:
            initial_strategy = '_preTrainedImageNetModel'

        # change @ 20180705
        # introduce An for the AESTHETIC_LEVEL is set to n
        logger.set_logger_dir('./train_log/AVA2012{6}-{0}-d{1}-{2}-{3}{4}{5}_LRT3'.format(args.mode, args.depth, \
            args.crop_method_TR, args.crop_method_TS, initial_strategy, \
            '' if args.JensenFactor == 0.0 else '_JE{}'.format(args.JensenFactor), \
            '' if args.aesthetic_level == AESTHETIC_LEVEL else '-A{}'.format(args.aesthetic_level)))

        config = get_config(model, args.data, args.crop_method_TR, args.color_augmentation, args.crop_method_TS)

        # load pre-trained model if it exists
        # TODO: layer-cascade or freeze-layer ? rely-backpropagation ?
        #       layer-wise adaptive scale rate ?
        if args.load:
            print('--> initialize the session with the checkpoint file %s' % args.load)
            config.session_init = get_model_loader(args.load)

        elif args.load_npy:
            print('--> initialize the session with the npy file %s' % args.load)
            # add @ 20171128: adopt the ImageNet pre-trained model for initialization purpose
            #                 load params from npy file, convert them into the desired formation,
Example #18
    parser.add_argument('--data_format',
                        help='specify NCHW or NHWC',
                        type=str,
                        default='NHWC')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument(
        '--batch_size_per_gpu',
        default=32,
        type=int,
        help='batch size per GPU. 32 gives best accuracy, '
        'higher values should be similarly good')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.data_format)
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        logger.set_logger_dir(os.path.join('train_log', 'vgg'))

        config = get_config(model)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #19
                                screen_dims=IMAGE_SIZE,
                                spacing=SPACING)
    NUM_ACTIONS = init_player.action_space.n
    num_validation_files = init_player.files.num_files

    if args.task != 'train':
        assert args.load is not None
        pred = OfflinePredictor(PredictConfig(
            model=Model(),
            session_init=get_model_loader(args.load),
            input_names=['state'],
            output_names=['Qvalue']))
        if args.task == 'play':
            t0 = time.time()
            play_n_episodes(get_player(directory=data_dir,
                                       files_list=test_list, viz=0.01,
                                       saveGif=args.saveGif,
                                       saveVideo=args.saveVideo),
                            pred, num_validation_files)

            t1 = time.time()
            print(t1-t0)
        elif args.task == 'eval':
            eval_model_multithread(pred, EVAL_EPISODE, get_player)
    else:
        logger.set_logger_dir(logger_dir) # todo: variable log dir
        config = get_config()
        if args.load:
            config.session_init = get_model_loader(args.load)
        launch_train_with_config(config, SimpleTrainer())
Example #20
                        type=float, default=0.1)
    parser.add_argument('--log_path', help='path of log',
                        type=str, default='')
    parser.add_argument('--action', help='action type',
                        type=str, default='')
    args = parser.parse_args()

    TOTAL_BATCH_SIZE = args.batch_size
    imagenet_utils.DEFAULT_IMAGE_SHAPE = args.input_size

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.data_format, args.mode, args.wd, args.qw, args.qa, learning_rate=args.lr, data_aug=args.data_aug)
    if args.eval:
        batch = 100  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.log_path == '':
            logger.set_logger_dir(
                os.path.join('train_log', 'imagenet_resnet_d' + str(args.depth) + args.logdir_id), action=None if args.action == '' else args.action)
        else:
            logger.set_logger_dir(args.log_path + '/train_log/' + args.logdir_id, action=None if args.action == '' else args.action)

        config = get_config(model, fake=args.fake, data_aug=args.data_aug)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #21
                        action='store_true')
    parser.add_argument('--data_format',
                        help='specify NCHW or NHWC',
                        type=str,
                        default='NCHW')
    parser.add_argument('-d',
                        '--depth',
                        help='resnet depth',
                        type=int,
                        default=18,
                        choices=[18, 34, 50, 101])
    parser.add_argument('--eval', action='store_true')
    args = parser.parse_args()

    DEPTH = args.depth
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.eval:
        BATCH_SIZE = 128  # something that can run on one gpu
        ds = get_data('val')
        eval_on_ILSVRC12(Model(), args.load, ds)
        sys.exit()

    logger.set_logger_dir(
        os.path.join('train_log', 'imagenet-resnet-d' + str(DEPTH)))
    config = get_config(fake=args.fake, data_format=args.data_format)
    if args.load:
        config.session_init = SaverRestore(args.load)
    SyncMultiGPUTrainerParameterServer(config).train()
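
This snippet predates launch_train_with_config: in tensorpack's legacy interface the trainer wrapped the TrainConfig directly. A sketch of the equivalent launch under the newer API used by the other examples (same config object assumed):

from tensorpack import launch_train_with_config
from tensorpack.train import SyncMultiGPUTrainerParameterServer

trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
launch_train_with_config(config, trainer)
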
Example #22
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', help='logdir', default='')
    args = parser.parse_args()

    # P_py  = np.load('/jasper/models/gp140/P_py.npy')
    Ppy = np.load('/jasper/models/BetaGal/betagal1.5_projections.npy')

    Ppy = Ppy[0]  # leave only first symmetric unit
    vlen, nviews = Ppy.shape[-1], Ppy.shape[0]

    os.environ['CUDA_VISIBLE_DEVICES'] = get_visible_device_list(3)
    global_step = get_global_step_var()

    # set logger directory for checkpoints, etc
    logger.set_logger_dir(args.logdir, action='k')

    steps_per_epoch = cfg.EPOCH_STEPS
    model = Model(vlen, nviews)
    # config.gpu_options.allow_growth = True
    traincfg = TrainConfig(
        model=model,
        data=QueueInput(ProjDataFlow(Ppy)),
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=5),
            PeriodicTrigger(VolumeSaver(model), every_k_epochs=5),
            # prevent learning in the first epoch
            # MemInitHyperParamSetter('learning_rate_mask',(0,1)),
            # controls learning rate as a function of epoch
            HyperParamSetterWithFunc('learning_rate', learning_rate_fun),
            # GraphProfiler()
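
HyperParamSetterWithFunc sets a hyperparameter from an arbitrary function of the epoch, which is what learning_rate_fun supplies above. The function receives the epoch number and the current value; a minimal sketch with an invented decay schedule:

def learning_rate_fun(epoch_num, old_lr):
    # halve the learning rate every 30 epochs (illustrative constants)
    return 1e-3 * 0.5 ** (epoch_num // 30)
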
Example #23
                        help='systolic array width',
                        type=int,
                        default=256)
    parser.add_argument('--accumulator-array-height',
                        help='accumulator array height',
                        type=int,
                        default=4096)
    parser.add_argument('--tensorpack-logdir-id',
                        help='TensorPack training log directory id',
                        type=str,
                        default='')
    parser.add_argument('--mpusim-logdir',
                        help='MPU simulator log directory',
                        type=str,
                        default='.')
    args = parser.parse_args()

    model = Model(args.resnet_depth, args.activations_datatype_size_byte,
                  args.weights_datatype_size_byte,
                  args.results_datatype_size_byte, args.systolic_array_height,
                  args.systolic_array_width, args.accumulator_array_height,
                  args.mpusim_logdir)

    logger.set_logger_dir(
        os.path.join(
            'train_log', 'resnext_{}{}'.format(args.resnet_depth,
                                               args.tensorpack_logdir_id)))

    config = get_config(model)
    launch_train_with_config(config, SimpleTrainer())
Example #24
    scalar_steps = args.get("scalar_steps") or 0
    if scalar_steps > 0:
        scalar_steps = max(scalar_steps // equi_batch_size, 1)
    else:
        scalar_steps = 0  # merge scalar summary every epoch
    # lr starts decreasing at half of max epoch
    start_dec_epoch = max_epoch // 2
    # stops when lr is 0.01 of its initial value
    end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
    # adjust noise input range according to the input act
    zmin, zmax = (0, 1) if args.get("act") == "identity" else (-1, 1)

    if save_folder is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_folder)

    df = get_data(data_folder, image_size, zmin=zmin, zmax=zmax)
    df = PrintData(df)
    data = QueueInput(df)

    SynTexTrainer(data, AdaptiveSynTex(args), n_gpu).train_with_defaults(
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
            PeriodicTrigger(ModelSaver(),
                            every_k_epochs=end_epoch),  # save model at last
            ScheduledHyperParamSetter('learning_rate', [(start_dec_epoch, lr),
                                                        (max_epoch, 0)],
                                      interp="linear"),
            #PeriodicTrigger(VisualizeTestSet(data_folder, image_size), every_k_epochs=10),
            MergeAllSummaries(period=scalar_steps),  # scalar only
Example #25
        "Note that it's best to keep per-GPU batch size in [32, 64] to obtain the best accuracy."
        "Pretrained models listed in README were trained with batch=32x8.")
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    bit_actn, bit_weight = None, None
    if args.quant:
        bit_actn, bit_weight = args.quant_bit_actn, args.quant_bit_weight

    model = Model(args.use_fp16, bit_actn, bit_weight)
    model.data_format = args.data_format
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(
                os.path.join('train_log',
                             'imagenet-darknet-batch{}'.format(args.batch)))

        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #26
    parser.add_argument("--save-dir")
    parser.add_argument("--lr", type=float, default=0.1)
    parser.add_argument("--mult-decay", type=float, default=MULT_DECAY)
    args = parser.parse_args()
    NUM_UNITS = args.num_units
    mult_decay = args.mult_decay
    lr_base = args.lr
    save_dir = args.save_dir

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if save_dir is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_dir)

    dataset_train = get_data('train')
    dataset_test = get_data('test')

    config = TrainConfig(
        model=CifarResNet(n=NUM_UNITS,
                          mult_decay=mult_decay,
                          lr_init=lr_base * 0.1),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(
                dataset_test,
                [ScalarStats('cost'),
                 ClassificationError('wrong_vector')]),
Example #27
            del d[key]
        if args.fix_mean_var:
            eval_checkpoint = eval_checkpoint.replace('%03d' % original_scale,
                                                      '%03d' % scale)
            d_ = tfutils.varmanip.load_chkpt_vars(eval_checkpoint)
            for key in d.keys():
                if 'mean' in key or 'variance' in key:
                    d[key] = d_[key]
        sessinit = tfutils.sessinit.DictRestore(d)
        batch = 100  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, scale, sessinit, ds)
    else:
        distill = '-distill' if args.kd else ''
        fixed_qa = '-fixed_qa' if args.fixed_qa else ''
        note = '-%s' % args.note if args.note is not None else ''
        note = distill + fixed_qa + note
        logger_name = '%s%d-%d-%d-%s%s' \
            % (args.mode, args.depth, args.qw, args.qa, args.scales.replace(',', '_'), note)
        logger_dir = os.path.join('train_log', logger_name + args.logdir_id)
        logger.set_logger_dir(logger_dir, action=args.action)
        config = get_config(model,
                            scales,
                            distill=args.kd,
                            fake=args.fake,
                            data_aug=True)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #28
    model = Model(args.depth,
                  args.data_format,
                  args.mode,
                  args.wd,
                  args.qw,
                  args.qa,
                  learning_rate=args.lr,
                  data_aug=args.data_aug)
    if args.eval:
        batch = 100  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.log_path == '':
            logger.set_logger_dir(
                os.path.join(
                    'train_log',
                    'imagenet_resnet_d' + str(args.depth) + args.logdir_id),
                action=None if args.action == '' else args.action)
        else:
            logger.set_logger_dir(
                args.log_path + '/train_log/' + args.logdir_id,
                action=None if args.action == '' else args.action)

        config = get_config(model, fake=args.fake, data_aug=args.data_aug)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #29
                        choices=[50, 101])
    parser.add_argument('--logdir', default='train_log/ResNet-GN')
    parser.add_argument('--WS',
                        action='store_true',
                        help='Use Weight Standardization')
    args = parser.parse_args()

    model = Model()
    model.depth = args.depth
    model.use_WS = args.WS
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_imagenet_dataflow(args.data, 'val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(args.logdir, 'd')

        try:
            from tensorpack.tfutils import collect_env_info
            logger.info("\n" + collect_env_info())
        except Exception:
            pass
        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #30
                        type=str, default='NCHW')
    parser.add_argument('-d', '--depth', help='resnet depth',
                        type=int, default=18, choices=[18, 34, 50, 101, 152])
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--batch', default=256, type=int,
                        help='total batch size. 32 per GPU gives best accuracy, higher values should be similarly good')
    parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
                        help='variants of resnet to use', default='resnet')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.data_format, args.mode)
    if args.eval:
        batch = 128    # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(
                os.path.join('train_log', 'imagenet-{}-d{}'.format(args.mode, args.depth)))

        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #31
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--data', default="/data_a/dataset/imagenet2012", help='ILSVRC dataset dir')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--fake', help='use fakedata to test or benchmark this model', action='store_true')
    parser.add_argument('--data_format', help='specify NCHW or NHWC',
                        type=str, default='NHWC')
    parser.add_argument('--eval', action='store_true')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.data_format)
    if args.eval:
        batch = 128    # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        logger.set_logger_dir(
            os.path.join('train_log', 'imagenet-vgg'))

        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #32
        "Note that it's best to keep per-GPU batch size in [32, 64] to obtain the best accuracy."
        "Pretrained models listed in README were trained with batch=32x8.")
    parser.add_argument('--mode',
                        choices=['resnet', 'preact', 'se'],
                        help='variants of resnet to use',
                        default='resnet')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.mode)
    model.data_format = args.data_format
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(
                os.path.join('train_log',
                             'imagenet-{}-d{}'.format(args.mode, args.depth)))

        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #33
                        help='variants of resnet to use',
                        default='resnet')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.mode)
    model.data_format = args.data_format
    if args.weight_decay_norm:
        model.weight_decay_pattern = ".*/W|.*/gamma|.*/beta"

    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_imagenet_dataflow(args.data, 'val', batch)
        eval_classification(model, SmartInit(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(
                os.path.join(
                    '/data0/wangguangrun/tensorflow_log/train_log',
                    'imagenet-{}-d{}-batch{}'.format(args.mode, args.depth,
                                                     args.batch)))

        config = get_config(model)
        config.session_init = SmartInit(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
        launch_train_with_config(config, trainer)