Example #1
    def __init__(self, config):
        """
        GANTrainer expects a ModelDesc in config which sets the following
        attributes after :meth:`_build_graph`: g_loss, d_loss, g_vars, d_vars.
        """
        input = QueueInput(config.dataflow)
        model = config.model

        cbs = input.setup(model.get_inputs_desc())
        config.callbacks.extend(cbs)

        with TowerContext('', is_training=True):
            model.build_graph(input)
        opt = model.get_optimizer()

        # by default, run one d_min after one g_min
        with tf.name_scope('optimize'):
            g_min = opt.minimize(model.g_loss,
                                 var_list=model.g_vars,
                                 name='g_op')
            with tf.control_dependencies([g_min]):
                d_min = opt.minimize(model.d_loss,
                                     var_list=model.d_vars,
                                     name='d_op')
        self.train_op = d_min

        super(GANTrainer, self).__init__(config)
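
A minimal invocation sketch for this config-based trainer; MyGANModel and my_dataflow are hypothetical stand-ins, with the model assumed to be a ModelDesc that sets g_loss, d_loss, g_vars and d_vars as the docstring requires:

# Sketch only, assuming the legacy config-based tensorpack API used above.
from tensorpack import TrainConfig, ModelSaver

config = TrainConfig(
    model=MyGANModel(),       # hypothetical ModelDesc defining the four attributes
    dataflow=my_dataflow,     # any DataFlow yielding training batches
    callbacks=[ModelSaver()],
    steps_per_epoch=300,
    max_epoch=100,
)
GANTrainer(config).train()    # each step runs g_min, then d_min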
Example #2
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn(
            "Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported."
        )
    if fake:
        data = QueueInput(
            FakeData([[batch, 224, 224, 3], [batch]],
                     1000,
                     random=False,
                     dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, min(START_LR, BASE_LR)),
                                       (30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (90, BASE_LR * 1e-3),
                                       (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, START_LR),
                                                            (5, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
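
To make the warmup rule above concrete, a small sketch with assumed total batch sizes (values chosen for illustration only):

# Illustration of the warmup logic in get_config() above, for assumed batches.
for total_batch in (256, 512, 1024):
    START_LR = 0.1
    BASE_LR = START_LR * (total_batch / 256.0)
    if BASE_LR > START_LR:
        # e.g. total_batch=1024 -> BASE_LR=0.4, warmed up linearly over epochs 0-5
        print(total_batch, '-> linear warmup 0.1 ->', BASE_LR)
    else:
        print(total_batch, '-> start directly at', min(START_LR, BASE_LR))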
Example #3
    def __init__(self, config, d_period=1, g_period=1):
        """
        Args:
            d_period(int): period of each d_opt run
            g_period(int): period of each g_opt run
        """
        self._d_period = int(d_period)
        self._g_period = int(g_period)
        assert min(d_period, g_period) == 1

        input = QueueInput(config.dataflow)
        model = config.model

        cbs = input.setup(model.get_inputs_desc())
        config.callbacks.extend(cbs)
        with TowerContext('', is_training=True):
            model.build_graph(input)

        opt = model.get_optimizer()
        with tf.name_scope('optimize'):
            self.d_min = opt.minimize(model.d_loss,
                                      var_list=model.d_vars,
                                      name='d_min')
            self.g_min = opt.minimize(model.g_loss,
                                      var_list=model.g_vars,
                                      name='g_min')

        super(SeparateGANTrainer, self).__init__(config)
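
A usage sketch for the period arguments, assuming config is a TrainConfig built as in Example #1; the assert requires at least one of the two periods to be 1:

# Sketch: update D on every third step, G on every step.
trainer = SeparateGANTrainer(config, d_period=3, g_period=1)
trainer.train()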
Example #4
    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data(pandas.DataFrame): dataset to fit the model.

        Returns:
            None

        """

        self.preprocessor = Preprocessor(
            continuous_columns=self.continuous_columns)
        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata
        dataflow = TGANDataFlow(data, self.metadata)
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        from tensorpack.callbacks import CometMLMonitor

        trainer = SeparateGANTrainer(
            model=self.model,
            input_queue=input_queue,
            g_period=6,
        )

        self.restore_path = os.path.join(self.model_dir, 'checkpoint')

        if os.path.isfile(self.restore_path) and self.restore_session:
            session_init = SaverRestore(self.restore_path)
            with open(os.path.join(self.log_dir, 'stats.json')) as f:
                starting_epoch = json.load(f)[-1]['epoch_num'] + 1

        else:
            session_init = None
            starting_epoch = 1

        action = 'k' if self.restore_session else 'd'
        # logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        monitors = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))
        callbacks.append(MergeAllSummaries(period=10))

        if self.experiment is not None:
            monitors.append(CometMLMonitor(experiment=self.experiment))

        trainer.train_with_defaults(callbacks=callbacks,
                                    monitors=monitors,
                                    steps_per_epoch=self.steps_per_epoch,
                                    max_epoch=self.max_epoch,
                                    session_init=session_init,
                                    starting_epoch=starting_epoch)

        self.prepare_sampling()
Example #5
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
        steps_per_epoch = 100
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))

        dataset_train = get_imagenet_dataflow(args.data, 'train', batch)
        dataset_val = get_imagenet_dataflow(args.data, 'val', min(64, batch))
        steps_per_epoch = 1281167 // args.batch

        BASE_LR = 0.1 * args.batch / 256.0
        logger.info("BASELR: {}".format(BASE_LR))
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            GPUUtilizationTracker(),
            ScheduledHyperParamSetter('learning_rate', [(0, BASE_LR),
                                                        (30, BASE_LR * 1e-1),
                                                        (60, BASE_LR * 1e-2),
                                                        (90, BASE_LR * 1e-3)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, 0.1),
                                           (5 * steps_per_epoch, BASE_LR)],
                                          interp='linear',
                                          step_based=True))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        max_epoch=100,
    )
Example #6
    def __init__(self, config):
        self._nr_gpu = config.nr_tower
        assert self._nr_gpu > 1
        self._raw_devices = ['/gpu:{}'.format(k) for k in config.tower]
        self._input_source = StagingInputWrapper(
            QueueInput(config.dataflow), self._raw_devices)
        super(MultiGPUGANTrainer, self).__init__(config)
Example #7
def get_config(model, nr_tower):
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', [(0, 3e-1), (30, 3e-2),
                                                    (60, 3e-3), (90, 3e-4)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs,
                                        list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=100,
    )
Example #8
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower
    )
Example #9
    def __init__(self, config, d_interval=1):
        """
        Args:
            d_interval: run d_opt only after this many runs of g_opt.
        """
        self._input_method = QueueInput(config.dataflow)
        self._d_interval = d_interval
        super(SeparateGANTrainer, self).__init__(config)
Example #10
def train(args):
    data_folder = args.get("data_folder")
    save_folder = args.get("save_folder")
    image_size = args.get("image_size")
    max_epoch = args.get("max_epoch")
    save_epoch = args.get("save_epoch") or max_epoch // 10
    # Scale lr and steps_per_epoch accordingly.
    # Make sure the total number of gradient evaluations is consistent.
    n_gpu = args.get("n_gpu") or 1
    batch_size = args.get("batch_size") or BATCH
    equi_batch_size = max(n_gpu, 1) * batch_size
    lr = args.get("lr") or LR
    lr *= equi_batch_size
    steps_per_epoch = args.get("steps_per_epoch") or 1000
    # integer division keeps steps_per_epoch an int for the training loop
    steps_per_epoch = max(steps_per_epoch // equi_batch_size, 1)
    image_steps = args.get("image_steps") or steps_per_epoch // 10
    scalar_steps = args.get("scalar_steps") or 0
    if scalar_steps > 0:
        scalar_steps = max(scalar_steps // equi_batch_size, 1)
    else:
        scalar_steps = 0  # merge scalar summary every epoch
    # lr starts decreasing at half of max epoch
    start_dec_epoch = max_epoch // 2
    # stops when lr is 0.01 of its initial value
    end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
    # adjust noise input range according to the input act
    zmin, zmax = (0, 1) if args.get("act_input") == "identity" else (-1, 1)

    if save_folder is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_folder)

    df = get_data(data_folder,
                  image_size,
                  zmin=zmin,
                  zmax=zmax,
                  batch=batch_size)
    df = PrintData(df)
    data = QueueInput(df)

    SynTexTrainer(data, Style2PO(args), n_gpu).train_with_defaults(
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
            PeriodicTrigger(ModelSaver(),
                            every_k_epochs=end_epoch),  # save model at last
            ScheduledHyperParamSetter('learning_rate', [(start_dec_epoch, lr),
                                                        (max_epoch, 0)],
                                      interp="linear"),
            PeriodicTrigger(VisualizeTestSet(data_folder, image_size),
                            every_k_epochs=max(1, max_epoch // 100)),
            #MergeAllSummaries(period=scalar_steps), # scalar only, slowdown in training, use TCMalloc
            MergeAllSummaries(period=image_steps, key="image_summaries"),
            MergeAllSummaries(key="acti_summaries"),
        ],
        max_epoch=end_epoch,
        steps_per_epoch=steps_per_epoch,
        session_init=None)
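
To see how the scaling keeps the total number of gradient evaluations consistent, a worked example with assumed values (two GPUs, per-GPU batch 32, defaults of lr=1e-4 and 1000 steps):

# Hypothetical numbers illustrating the scaling logic in train() above.
n_gpu, batch_size = 2, 32
lr, steps = 1e-4, 1000                    # assumed LR and steps_per_epoch defaults
equi_batch_size = n_gpu * batch_size      # 64
lr *= equi_batch_size                     # 6.4e-3: lr grows with the total batch
steps = max(steps // equi_batch_size, 1)  # 15 steps per epoch
# steps * equi_batch_size == 960, close to the original 1000 evaluations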
Example #11
def get_config(model, fake=False):
    start_ = 0
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            # fail fast instead of using an undefined lr_setting below
            raise ValueError('no learning rate setting for start_ >= 91')

        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', lr_setting),
            # TensorPrinter(['tower1/group3/block2/conv2/Abs_0', 'tower1/group3/block2/conv2/Abs_1:0', 'tower1/group3/block2/conv2/Abs_2:0'])
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )
Example #12
    def __init__(self, config, g_vs_d=1):
        self._input_method = QueueInput(config.dataset)
        super(GANTrainer, self).__init__(config)
        if g_vs_d > 1:
            self._opt_g = g_vs_d
            self._opt_d = 1
        else:
            self._opt_g = 1
            self._opt_d = int(1.0 / g_vs_d)
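
The ratio maps to per-iteration update counts, e.g. (hypothetical calls):

# g_vs_d=3   -> _opt_g=3, _opt_d=1: three G updates per D update
# g_vs_d=0.5 -> _opt_g=1, _opt_d=2: two D updates per G update
trainer = GANTrainer(config, g_vs_d=3)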
Example #13
def get_config(model, scales, distill=False, fake=False, data_aug=True):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)
        callbacks = [ModelSaver()]
        if data_aug:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(30, 1e-2),
                                                            (60, 1e-3),
                                                            (85, 1e-4),
                                                            (95, 1e-5),
                                                            (105, 1e-6)]))
        callbacks.append(HumanHyperParamSetter('learning_rate'))
        infs = []
        for scale in scales:
            infs.append(
                ClassificationError('wrong-scale%03d-top1' % scale,
                                    'val-error-scale%03d-top1' % scale))
            infs.append(
                ClassificationError('wrong-scale%03d-top5' % scale,
                                    'val-error-scale%03d-top5' % scale))
        if distill:
            infs.append(
                ClassificationError('wrong-scale_ensemble-top1',
                                    'val-error-scale_ensemble-top1'))
            infs.append(
                ClassificationError('wrong-scale_ensemble-top5',
                                    'val-error-scale_ensemble-top5'))
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000 if TOTAL_BATCH_SIZE == 256 else 10000,
        max_epoch=120 if data_aug else 64,
        nr_tower=nr_tower)
Example #14
    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data(pandas.DataFrame): dataset to fit the model.

        Returns:
            None

        """
        self.preprocessor = Preprocessor(
            continuous_columns=self.continuous_columns)

        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata
        dataflow = TGANDataFlow(data, self.metadata)
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        if self.trainer == 'GANTrainer':
            trainer = GANTrainer(model=self.model, input_queue=input_queue)
        elif self.trainer == 'SeparateGANTrainer':
            trainer = SeparateGANTrainer(model=self.model,
                                         input_queue=input_queue)
        else:
            raise ValueError(
                'Incorrect trainer name. Use GANTrainer or SeparateGANTrainer')

        # trainer = SeparateGANTrainer(model=self.model, input_queue=input_queue)

        self.restore_path = os.path.join(self.model_dir, 'checkpoint')

        if os.path.isfile(self.restore_path) and self.restore_session:
            session_init = SaverRestore(self.restore_path)
            with open(os.path.join(self.log_dir, 'stats.json')) as f:
                starting_epoch = json.load(f)[-1]['epoch_num'] + 1
        else:
            session_init = None
            starting_epoch = 1

        action = 'k' if self.restore_session else None
        logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

        trainer.train_with_defaults(callbacks=callbacks,
                                    steps_per_epoch=self.steps_per_epoch,
                                    max_epoch=self.max_epoch,
                                    session_init=session_init,
                                    starting_epoch=starting_epoch)

        self.prepare_sampling()
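
A minimal usage sketch for this method; the wrapper class and column indices are illustrative assumptions, not confirmed by the snippet:

# Hypothetical wrapper exposing the fit() shown above.
import pandas as pd

df = pd.read_csv('data.csv')        # assumed training table
synthesizer = TGANModel(            # hypothetical constructor
    continuous_columns=[0, 3],      # indices of continuous columns
    trainer='GANTrainer')           # or 'SeparateGANTrainer'
synthesizer.fit(df)                 # preprocess, build the model, train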
Example #15
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (85, BASE_LR * 1e-3),
                                       (95, BASE_LR * 1e-4),
                                       (105, BASE_LR * 1e-5)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, 0.1),
                                                            (3, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=110,
    )
Example #16
    def __init__(self, config, d_period=1, g_period=1):
        """
        Args:
            d_period(int): period of each d_opt run
            g_period(int): period of each g_opt run
        """
        self._input_source = QueueInput(config.dataflow)
        self._d_period = int(d_period)
        self._g_period = int(g_period)
        assert min(d_period, g_period) == 1
        super(SeparateGANTrainer, self).__init__(config)
Example #17
def get_config(model):
    batch = 1

    logger.info("For benchmark, batch size is fixed to 1 per tower.")
    data = QueueInput(
        FakeData([[1, 224, 224, 3], [1]], 1, random=False, dtype='uint8'))

    return TrainConfig(model=model,
                       data=data,
                       callbacks=[],
                       steps_per_epoch=1,
                       max_epoch=1)
Example #18
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4),
                                       (60, 5e-5), (80, 2.5e-5),
                                       (100, 1.25e-5), (120, 5e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        # Finetune COCO
        #[(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5),(80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]
        #JT COCO
        #[(0, 2.5e-4), (20, 1.25e-4), (40, 5e-5), (60, 2.5e-5),(80, 1e-5), (100, 5e-6), (120, 2.5e-6)]
        #Finetune to VOC
        #[(0, 1.25e-4), (20, 5e-5), (40, 2.5e-5), (60,1.25e-5),(80, 5e-6), (100, 2.5e-6), (120, 1.25e-6)]

        #infs = [ClassificationError('wrong-top1', 'val-error-top1'),
        #        ClassificationError('wrong-top5', 'val-error-top5')]
        infs = [
            ClassificationError('loss-wrong-top1', 'loss-val-error-top1'),
            ClassificationError('loss-wrong-top5', 'loss-val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=1522,
                       max_epoch=140,
                       nr_tower=nr_tower)
Example #19
def run(args):
    df_train = s2b_df(args.train_dir_face, args.train_dir_bitmoji,
                      args.batch_size, args.num_threads)
    df_test = s2b_df(args.test_dir_face, args.test_dir_bitmoji,
                     args.batch_size, args.num_threads)

    def update_lr(epoch, cur_lr):
        """ Approximate exponential decay of the learning rate """
        if args.resume_lr:
            return cur_lr * args.decay
        else:
            return args.lr * args.decay**epoch

    callbacks = [
        cb.ModelSaver(),
        cb.MinSaver('val-error-top1'),
        cb.HyperParamSetterWithFunc('LR', update_lr),
        # cb.HyperParamSetterWithFunc('Instance_Noise_Stddev', lambda epoch, stddev: stddev * args.decay),
        # cb.HyperParamSetterWithFunc('D_Uncertainty_Threshold', lambda epoch, threshold: threshold * args.decay),
        cb.MergeAllSummaries(period=args.summary_freq),
    ]
    infs = [
        cb.ScalarStats(
            ['L_c', 'L_const', 'L_gan_d', 'L_gan_g', 'L_tid', 'L_tv'])
    ]

    if get_nr_gpu() > 0:
        callbacks.append(cb.GPUUtilizationTracker())

    callbacks.append(cb.InferenceRunner(QueueInput(df_test), infs))

    S2BTrainer(QueueInput(df_train),
               Selfie2BitmojiModel(args)).train_with_defaults(
                   callbacks=callbacks,
                   max_epoch=args.epochs,
                   steps_per_epoch=df_train.size(),
                   session_init=SaverRestore(args.load_path))
Example #20
def get_config(model, fake=False, xla=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(10, 1e-2), (20, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

        config = tf.ConfigProto()
        jit_level = 0
        if xla:
            # Turns on XLA JIT compilation
            jit_level = tf.OptimizerOptions.ON_1
        config.graph_options.optimizer_options.global_jit_level = jit_level

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=10,
                       max_epoch=1,
                       nr_tower=nr_tower)
Example #21
def get_config(args, model, num_gpus, num_towers):
    """
    Create the TensorPack Trainer configuration.

    :param args: The cli arguments.
    :param model: The model object to train.
    :param num_gpus: The number of gpus on which to train
    :param num_towers: The number of data parallel towers to create

    :return: A TrainConfig object.
    """
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        num_towers, args.batch_size))

    df_train = avatar_synth_df(args.train_dir, args.batch_size,
                               args.num_threads)
    df_test = avatar_synth_df(args.test_dir, args.batch_size, args.num_threads)

    def update_lr(epoch, cur_lr):
        """ Approximate exponential decay of the learning rate """
        if args.resume_lr:
            return cur_lr * args.lr_decay
        else:
            return args.lr * args.lr_decay**epoch

    callbacks = [
        cb.ModelSaver(),
        cb.MinSaver('val-error-top1'),
        cb.HyperParamSetterWithFunc('tower0/Avatar_Synth/LR:0', update_lr),
        cb.MergeAllSummaries(period=args.summary_freq),
    ]
    infs = [cb.ScalarStats('Avatar_Synth/Cost')]

    if num_gpus > 0:
        callbacks.append(cb.GPUUtilizationTracker())

    if num_towers == 1:  # single-GPU inference with queue prefetch
        callbacks.append(cb.InferenceRunner(QueueInput(df_test), infs))
    else:  # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            cb.DataParallelInferenceRunner(df_test, infs,
                                           list(range(num_towers))))

    return TrainConfig(model=model,
                       dataflow=df_train,
                       callbacks=callbacks,
                       max_epoch=args.epochs,
                       nr_tower=num_towers)
Example #22
    def get_data(self, name, num_gpu):
        gpu_batch = self.batch_size // num_gpu

        assert name in ['train', 'val', 'test']
        isTrain = name == 'train'

        augmentors = fbresnet_augmentor(isTrain)
        assert isinstance(augmentors, list)

        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

        if isTrain:
            ds = dataset.ILSVRC12(self.datadir,
                                  name,
                                  shuffle=True,
                                  dir_structure='train')
            ds = AugmentImageComponent(ds, augmentors, copy=False)
            ds = MultiProcessRunnerZMQ(ds, parallel)
            ds = BatchData(ds, gpu_batch, remainder=False)
            #ds = QueueInput(ds)
        else:
            ds = dataset.ILSVRC12Files(self.datadir,
                                       name,
                                       shuffle=False,
                                       dir_structure='train')
            aug = imgaug.AugmentorList(augmentors)

            def mapf(dp):
                fname, cls = dp
                im = cv2.imread(fname, cv2.IMREAD_COLOR)
                im = aug.augment(im)
                return im, cls

            ds = MultiThreadMapData(ds,
                                    parallel,
                                    mapf,
                                    buffer_size=2000,
                                    strict=True)
            ds = BatchData(ds, gpu_batch, remainder=True)
            ds = MultiProcessRunnerZMQ(ds, 1)

            if num_gpu == 1:
                ds = QueueInput(ds)
        return ds
Example #23
def get_config(model, checkpoint_dir, target_shape, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, target_shape, target_shape, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch, target_shape)
        dataset_val = get_data('val', batch, target_shape)
        callbacks = [
            ModelSaver(checkpoint_dir=checkpoint_dir),
            ScheduledHyperParamSetter('learning_rate', [(30, 1e-2), (60, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))


    # 7.5 it / sec testing
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 300,  #5000 
        max_epoch=110,
        nr_tower=nr_tower)
Example #24
def get_config(model):
    nr_tower = get_nr_gpu()

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, args.batch_size_per_gpu))
    dataset_train = get_data('train', args.batch_size_per_gpu)
    dataset_val = get_data('val', args.batch_size_per_gpu)

    BASE_LR = 1e-3 * (args.batch_size_per_gpu * nr_tower / 256.0)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', [(0, BASE_LR),
                                                    (60, BASE_LR * 1e-1),
                                                    (90, BASE_LR * 1e-2)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    '''
    if BASE_LR > 0.1:
        callbacks.append(
            ScheduledHyperParamSetter(
                'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))
    '''

    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs,
                                        list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=1280000 // (args.batch_size_per_gpu * nr_tower),
        max_epoch=110,
    )
Example #25
def get_config(model, data_dir, crop_method_TR, color_augmentation, crop_method_TS):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    # data pipelines of train and validation
    dataset_train = get_data('train', data_dir, batch, crop_method_TR, \
        color_augmentation = color_augmentation, CAM_dir_pkl = CAM_DIR_PKL)
    dataset_val = get_data('val', data_dir, batch, crop_method_TS)

    # TODO
    callbacks = [
        # class callbacks.ModelSaver(max_to_keep = 10, keep_checkpoint_every_n_hours = 0.5,
        #     checkpoint_dir = None, var_collections = 'variables')
        ModelSaver(max_to_keep = MAX_EPOCH),
        # @ 20171129: finetune on ResNet d18 from ImageNet
        # maybe a moderate learning_rate is preferable
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 1e-3), (20, 5e-4), (40, 1e-4), (60, 1e-5)]),
        HumanHyperParamSetter('learning_rate'),
    ]

    # 0 or 1
    infs = [ClassificationError('wrong-top1', 'val-error-top1')]

    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))

    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model           = model,
        dataflow        = dataset_train,
        callbacks       = callbacks,
        #steps_per_epoch = 5000,
        max_epoch       = MAX_EPOCH,
        nr_tower        = nr_tower
    )
Example #26
def get_config(model, conf):
    nr_tower = max(get_nr_gpu(), 1)
    batch = conf.batch
    if conf.fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data(conf.data_dir, 'train', batch)
        dataset_val = get_data(conf.data_dir, 'val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(45, 1e-2), (60, 1e-3),
                                                        (65, 1e-4), (70, 1e-5),
                                                        (75, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=5000,
                       max_epoch=80,
                       nr_tower=nr_tower)
Example #27
    def __init__(self, config):
        nr_gpu = config.nr_tower
        assert nr_gpu > 1
        raw_devices = ['/gpu:{}'.format(k) for k in config.tower]

        # setup input
        input = StagingInputWrapper(QueueInput(config.dataflow), raw_devices)
        model = config.model
        cbs = input.setup(model.get_inputs_desc())
        config.callbacks.extend(cbs)

        def get_cost():
            model.build_graph(input)
            return [model.d_loss, model.g_loss]

        devices = [
            LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices
        ]
        cost_list = MultiGPUTrainerBase.build_on_multi_tower(
            config.tower, get_cost, devices)
        # simply average the cost. It might get faster to average the gradients
        with tf.name_scope('optimize'):
            d_loss = tf.add_n([x[0] for x in cost_list]) * (1.0 / nr_gpu)
            g_loss = tf.add_n([x[1] for x in cost_list]) * (1.0 / nr_gpu)

            opt = model.get_optimizer()
            # run one d_min after one g_min
            g_min = opt.minimize(g_loss,
                                 var_list=model.g_vars,
                                 colocate_gradients_with_ops=True,
                                 name='g_op')
            with tf.control_dependencies([g_min]):
                d_min = opt.minimize(d_loss,
                                     var_list=model.d_vars,
                                     colocate_gradients_with_ops=True,
                                     name='d_op')
        self.train_op = d_min
        super(MultiGPUGANTrainer, self).__init__(config)
Example #28
def get_config(model, nr_tower):
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    step_size = 1280000 // TOTAL_BATCH_SIZE
    max_iter = 3 * 10**5
    max_epoch = (max_iter // step_size) + 1
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', [(0, 0.5), (max_iter, 0)],
                                  interp='linear',
                                  step_based=True),
    ]
    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs,
                                        list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=step_size,
        max_epoch=max_epoch,
    )
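
With an assumed TOTAL_BATCH_SIZE of 256, the schedule above works out to:

TOTAL_BATCH_SIZE = 256                    # assumed; defined elsewhere in the script
step_size = 1280000 // TOTAL_BATCH_SIZE   # 5000 steps per epoch
max_iter = 3 * 10**5                      # 300000 steps in total
max_epoch = (max_iter // step_size) + 1   # 61 epochs
# learning_rate decays linearly from 0.5 at step 0 to 0 at step 300000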
Example #29
        M.add(KL.Conv2D(32, 3, padding='same', activation='relu'))

        M.add(KL.Flatten())
        M.add(
            KL.Dense(512,
                     activation='relu',
                     kernel_regularizer=keras.regularizers.l2(1e-5)))
        M.add(KL.Dropout(0.5))
        M.add(
            KL.Dense(10,
                     activation=None,
                     kernel_regularizer=keras.regularizers.l2(1e-5)))
        M.add(KL.Activation('softmax'))
        return M

    dataset_train, dataset_test = get_data()

    M = KerasModel(model_func,
                   inputs_desc=[
                       InputDesc(tf.float32, [None, IMAGE_SIZE, IMAGE_SIZE, 1],
                                 'images')
                   ],
                   targets_desc=[InputDesc(tf.float32, [None, 10], 'labels')],
                   input=QueueInput(dataset_train))
    M.compile(optimizer=tf.train.AdamOptimizer(1e-3),
              loss='categorical_crossentropy',
              metrics='categorical_accuracy')
    M.fit(validation_data=dataset_test,
          steps_per_epoch=dataset_train.size(),
          callbacks=[ModelSaver()])
Example #30
        scalar_steps = 0  # merge scalar summary every epoch
    # lr starts decreasing at half of max epoch
    start_dec_epoch = max_epoch // 2
    # stops when lr is 0.01 of its initial value
    end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
    # adjust noise input range according to the input act
    zmin, zmax = (0, 1) if args.get("act") == "identity" else (-1, 1)

    if save_folder is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_folder)

    df = get_data(data_folder, image_size, zmin=zmin, zmax=zmax)
    df = PrintData(df)
    data = QueueInput(df)

    SynTexTrainer(data, AdaptiveSynTex(args), n_gpu).train_with_defaults(
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
            PeriodicTrigger(ModelSaver(),
                            every_k_epochs=end_epoch),  # save model at last
            ScheduledHyperParamSetter('learning_rate', [(start_dec_epoch, lr),
                                                        (max_epoch, 0)],
                                      interp="linear"),
            #PeriodicTrigger(VisualizeTestSet(data_folder, image_size), every_k_epochs=10),
            MergeAllSummaries(period=scalar_steps),  # scalar only
            MergeAllSummaries(period=image_steps, key="image_summaries"),
        ],
        max_epoch=end_epoch,
        steps_per_epoch=steps_per_epoch,