Example 1
    def train(self):
        assert self.mode == 'train', "Model not in 'train' mode"

        # if no weights have been loaded yet, load the backbone's pretrained
        # weights by default
        if not self._weights_loaded:
            self.load_weights(self.cfg.pretrain_weights)

        self.status.update({
            'epoch_id': self.start_epoch,
            'step_id': 0,
            'steps_per_epoch': len(self.loader)
        })

        self.status['batch_time'] = stats.SmoothedValue(
            self.cfg.log_iter, fmt='{avg:.4f}')
        self.status['data_time'] = stats.SmoothedValue(
            self.cfg.log_iter, fmt='{avg:.4f}')
        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

        for epoch_id in range(self.start_epoch, self.cfg.epoch):
            self.status['epoch_id'] = epoch_id
            self._compose_callback.on_epoch_begin(self.status)
            self.loader.dataset.set_epoch(epoch_id)
            iter_tic = time.time()
            for step_id, data in enumerate(self.loader):
                self.status['data_time'].update(time.time() - iter_tic)
                self.status['step_id'] = step_id
                self._compose_callback.on_step_begin(self.status)

                # model forward
                self.model.train()
                outputs = self.model(data)
                loss = outputs['loss']

                # model backward
                loss.backward()
                self.optimizer.step()
                curr_lr = self.optimizer.get_lr()
                self.lr.step()
                self.optimizer.clear_grad()
                self.status['learning_rate'] = curr_lr

                if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
                    self.status['training_staus'].update(outputs)

                self.status['batch_time'].update(time.time() - iter_tic)
                self._compose_callback.on_step_end(self.status)
                # reset the timer so data_time/batch_time measure a single
                # step rather than the time elapsed since the epoch began
                iter_tic = time.time()

            self._compose_callback.on_epoch_end(self.status)
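
Both timers above are stats.SmoothedValue instances, which keep a sliding
window of recent measurements so that the logged averages track the last
log_iter steps instead of the whole run. Below is a minimal sketch of such a
helper, assuming a ppdet-style interface (window_size, fmt, update, avg,
global_avg); it is an illustration, not the library's actual implementation.

from collections import deque

class SmoothedValue:
    """Windowed running average, e.g. for per-step batch/data times."""

    def __init__(self, window_size=20, fmt='{avg:.4f}'):
        self.deque = deque(maxlen=window_size)  # drops the oldest when full
        self.fmt = fmt
        self.total = 0.0
        self.count = 0

    def update(self, value):
        self.deque.append(value)
        self.total += value
        self.count += 1

    @property
    def avg(self):
        # average over the recent window only (what '{avg:.4f}' formats)
        return sum(self.deque) / max(len(self.deque), 1)

    @property
    def global_avg(self):
        # average over every value seen so far (useful for ETA estimates)
        return self.total / max(self.count, 1)

    def __str__(self):
        return self.fmt.format(avg=self.avg)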
Example 2
    def train(self, validate=False):
        assert self.mode == 'train', "Model not in 'train' mode"
        init_mark = False

        model = self.model
        if self.cfg.get('fleet', False):
            model = fleet.distributed_model(model)
            self.optimizer = fleet.distributed_optimizer(self.optimizer)
        elif self._nranks > 1:
            find_unused_parameters = self.cfg.get('find_unused_parameters',
                                                  False)
            model = paddle.DataParallel(
                self.model, find_unused_parameters=find_unused_parameters)

        # initialize the AMP (fp16) gradient scaler
        if self.cfg.get('fp16', False):
            scaler = amp.GradScaler(enable=self.cfg.use_gpu,
                                    init_loss_scaling=1024)

        self.status.update({
            'epoch_id': self.start_epoch,
            'step_id': 0,
            'steps_per_epoch': len(self.loader)
        })

        self.status['batch_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                        fmt='{avg:.4f}')
        self.status['data_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                       fmt='{avg:.4f}')
        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

        if self.cfg.get('print_flops', False):
            self._flops(self.loader)
        profiler_options = self.cfg.get('profiler_options', None)

        self._compose_callback.on_train_begin(self.status)

        for epoch_id in range(self.start_epoch, self.cfg.epoch):
            self.status['mode'] = 'train'
            self.status['epoch_id'] = epoch_id
            self._compose_callback.on_epoch_begin(self.status)
            self.loader.dataset.set_epoch(epoch_id)
            model.train()
            iter_tic = time.time()
            for step_id, data in enumerate(self.loader):
                self.status['data_time'].update(time.time() - iter_tic)
                self.status['step_id'] = step_id
                profiler.add_profiler_step(profiler_options)
                self._compose_callback.on_step_begin(self.status)
                data['epoch_id'] = epoch_id

                if self.cfg.get('fp16', False):
                    with amp.auto_cast(enable=self.cfg.use_gpu):
                        # model forward
                        outputs = model(data)
                        loss = outputs['loss']

                    # model backward
                    scaled_loss = scaler.scale(loss)
                    scaled_loss.backward()
                    # in dygraph mode, optimizer.minimize is equivalent to optimizer.step
                    scaler.minimize(self.optimizer, scaled_loss)
                else:
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']
                    # model backward
                    loss.backward()
                    self.optimizer.step()
                curr_lr = self.optimizer.get_lr()
                self.lr.step()
                if self.cfg.get('unstructured_prune'):
                    self.pruner.step()
                self.optimizer.clear_grad()
                self.status['learning_rate'] = curr_lr

                if self._nranks < 2 or self._local_rank == 0:
                    self.status['training_staus'].update(outputs)

                self.status['batch_time'].update(time.time() - iter_tic)
                self._compose_callback.on_step_end(self.status)
                if self.use_ema:
                    self.ema.update(self.model)
                iter_tic = time.time()

            # swap the EMA weights into the model for the epoch-end hooks;
            # deep-copy the live weights first so they can be restored below
            if self.use_ema:
                weight = copy.deepcopy(self.model.state_dict())
                self.model.set_dict(self.ema.apply())
            if self.cfg.get('unstructured_prune'):
                self.pruner.update_params()

            self._compose_callback.on_epoch_end(self.status)

            if validate and (self._nranks < 2 or self._local_rank == 0) \
                    and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \
                             or epoch_id == self.end_epoch - 1):
                if not hasattr(self, '_eval_loader'):
                    # build evaluation dataset and loader
                    self._eval_dataset = self.cfg.EvalDataset
                    self._eval_batch_sampler = \
                        paddle.io.BatchSampler(
                            self._eval_dataset,
                            batch_size=self.cfg.EvalReader['batch_size'])
                    self._eval_loader = create('EvalReader')(
                        self._eval_dataset,
                        self.cfg.worker_num,
                        batch_sampler=self._eval_batch_sampler)
                # if validation during training is enabled, the metrics must
                # be (re-)initialized; init_mark ensures this runs only once
                if validate and not init_mark:
                    init_mark = True
                    self._init_metrics(validate=validate)
                    self._reset_metrics()
                    self._reset_metrics()
                with paddle.no_grad():
                    self.status['save_best_model'] = True
                    self._eval_with_loader(self._eval_loader)

            # restore the original (non-EMA) weights on the model
            if self.use_ema:
                self.model.set_dict(weight)

        self._compose_callback.on_train_end(self.status)
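
Example 2 introduces the EMA swap-in/swap-out pattern: every step updates a
moving average of the weights, and at the end of each epoch the live weights
are deep-copied, the averaged weights are applied for evaluation and saving,
and the live weights are restored afterwards. Below is a minimal sketch of
the bookkeeping behind self.ema, assuming a plain dict-of-tensors layout and
an illustrative decay constant (ppdet's actual ModelEMA differs in details).

import copy

class SimpleEMA:
    """Exponential moving average of a model's weights."""

    def __init__(self, model, decay=0.9998):  # decay value is illustrative
        self.decay = decay
        # start from a detached copy of every parameter
        self.state = copy.deepcopy(model.state_dict())

    def update(self, model):
        # ema <- decay * ema + (1 - decay) * current, tensor by tensor
        for k, v in model.state_dict().items():
            self.state[k] = self.decay * self.state[k] + (1.0 - self.decay) * v

    def apply(self):
        # return the averaged weights; the caller must deep-copy the live
        # state_dict first so the original weights can be restored afterwards
        return self.state

The deep copy before set_dict matters: state_dict() hands back the live
parameter tensors, and set_dict writes into them in place, so a plain
reference would be overwritten along with the model.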
Example 3
    def train(self, validate=False):
        assert self.mode == 'train', "Model not in 'train' mode"

        # if no weights have been loaded yet, load the backbone's pretrained
        # weights by default
        if not self._weights_loaded:
            self.load_weights(self.cfg.pretrain_weights)

        model = self.model
        if self._nranks > 1:
            model = paddle.DataParallel(self.model)

        self.status.update({
            'epoch_id': self.start_epoch,
            'step_id': 0,
            'steps_per_epoch': len(self.loader)
        })

        self.status['batch_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                        fmt='{avg:.4f}')
        self.status['data_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                       fmt='{avg:.4f}')
        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

        for epoch_id in range(self.start_epoch, self.cfg.epoch):
            self.status['mode'] = 'train'
            self.status['epoch_id'] = epoch_id
            self._compose_callback.on_epoch_begin(self.status)
            self.loader.dataset.set_epoch(epoch_id)
            model.train()
            iter_tic = time.time()
            for step_id, data in enumerate(self.loader):
                self.status['data_time'].update(time.time() - iter_tic)
                self.status['step_id'] = step_id
                self._compose_callback.on_step_begin(self.status)

                # model forward
                outputs = model(data)
                loss = outputs['loss']

                # model backward
                loss.backward()
                self.optimizer.step()
                curr_lr = self.optimizer.get_lr()
                self.lr.step()
                self.optimizer.clear_grad()
                self.status['learning_rate'] = curr_lr

                if self._nranks < 2 or self._local_rank == 0:
                    self.status['training_staus'].update(outputs)

                self.status['batch_time'].update(time.time() - iter_tic)
                self._compose_callback.on_step_end(self.status)
                iter_tic = time.time()

            self._compose_callback.on_epoch_end(self.status)

            if validate and (self._nranks < 2 or self._local_rank == 0) \
                    and (epoch_id % self.cfg.snapshot_epoch == 0 \
                             or epoch_id == self.end_epoch - 1):
                if not hasattr(self, '_eval_loader'):
                    # build evaluation dataset and loader
                    self._eval_dataset = self.cfg.EvalDataset
                    self._eval_batch_sampler = \
                        paddle.io.BatchSampler(
                            self._eval_dataset,
                            batch_size=self.cfg.EvalReader['batch_size'])
                    self._eval_loader = create('EvalReader')(
                        self._eval_dataset,
                        self.cfg.worker_num,
                        batch_sampler=self._eval_batch_sampler)
                with paddle.no_grad():
                    self._eval_with_loader(self._eval_loader)
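
Note that Example 2 gates validation on (epoch_id + 1) % snapshot_epoch == 0
while Example 3 uses epoch_id % snapshot_epoch == 0; the off-by-one changes
which epochs trigger an evaluation pass. A quick check with made-up values:

snapshot_epoch, end_epoch = 5, 12
for epoch_id in range(end_epoch):
    v2 = (epoch_id + 1) % snapshot_epoch == 0 or epoch_id == end_epoch - 1
    v3 = epoch_id % snapshot_epoch == 0 or epoch_id == end_epoch - 1
    print(epoch_id, v2, v3)
# Example 2's form fires after epochs 4, 9 and 11 (every 5 completed epochs);
# Example 3's fires at 0, 5, 10 and 11, i.e. it also validates right after
# the very first epoch.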
Example 4
def run(FLAGS, cfg, place):
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    if ParallelEnv().nranks > 1:
        paddle.distributed.init_parallel_env()

    # Data
    datasets = cfg.TrainDataset
    train_loader = create('TrainReader')(datasets, cfg['worker_num'])
    steps = len(train_loader)

    # Model
    model = create(cfg.architecture)

    # Optimizer
    lr = create('LearningRate')(steps)
    optimizer = create('OptimizerBuilder')(lr, model.parameters())

    # Init Model & Optimizer
    start_epoch = 0
    if FLAGS.weight_type == 'resume':
        start_epoch = load_weight(model, cfg.pretrain_weights, optimizer)
    else:
        load_pretrain_weight(model, cfg.pretrain_weights,
                             cfg.get('load_static_weights', False),
                             FLAGS.weight_type)

    if getattr(model.backbone, 'norm_type', None) == 'sync_bn':
        assert cfg.use_gpu and ParallelEnv().nranks > 1, \
            'you should use bn rather than sync_bn while using a single gpu'
    # sync_bn = (getattr(model.backbone, 'norm_type', None) == 'sync_bn' and
    #            cfg.use_gpu and ParallelEnv().nranks > 1)
    # if sync_bn:
    #     model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # The parameter filter is a temporary fix for training because of
    # issue #28997 in Paddle.
    def no_grad(param):
        if param.name.startswith("conv1_") or param.name.startswith("res2a_") \
            or param.name.startswith("res2b_") or param.name.startswith("res2c_"):
            return True
        return False

    for param in filter(no_grad, model.parameters()):
        param.stop_gradient = True

    # Parallel Model
    if ParallelEnv().nranks > 1:
        model = paddle.DataParallel(model)

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)

    # Train settings
    end_epoch = int(cfg.epoch)
    batch_size = int(cfg['TrainReader']['batch_size'])
    total_steps = (end_epoch - start_epoch) * steps
    step_id = 0

    train_stats = stats.TrainingStats(cfg.log_iter)
    batch_time = stats.SmoothedValue(fmt='{avg:.4f}')
    data_time = stats.SmoothedValue(fmt='{avg:.4f}')

    end_time = time.time()
    space_fmt = ':' + str(len(str(steps))) + 'd'
    # Run Train
    for cur_eid in range(start_epoch, end_epoch):
        datasets.set_epoch(cur_eid)
        for iter_id, data in enumerate(train_loader):
            data_time.update(time.time() - end_time)
            # Model Forward
            model.train()
            outputs = model(data, mode='train')
            loss = outputs['loss']
            # Model Backward
            loss.backward()
            optimizer.step()
            curr_lr = optimizer.get_lr()
            lr.step()
            optimizer.clear_grad()

            batch_time.update(time.time() - end_time)
            if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
                train_stats.update(outputs)
                logs = train_stats.log()
                if iter_id % cfg.log_iter == 0:
                    eta_sec = (total_steps - step_id) * batch_time.global_avg
                    eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                    ips = float(batch_size) / batch_time.avg
                    fmt = ' '.join([
                        'Epoch: [{}]',
                        '[{' + space_fmt + '}/{}]',
                        '{meters}',
                        'eta: {eta}',
                        'batch_cost: {btime}',
                        'data_cost: {dtime}',
                        'ips: {ips:.4f} images/s',
                    ])
                    fmt = fmt.format(cur_eid,
                                     iter_id,
                                     steps,
                                     meters=logs,
                                     eta=eta_str,
                                     btime=str(batch_time),
                                     dtime=str(data_time),
                                     ips=ips)
                    logger.info(fmt)
            step_id += 1
            end_time = time.time()  # after outputs are copied to CPU
        # Save Stage (rank 0 only, at snapshot epochs and at the final epoch)
        if ParallelEnv().local_rank == 0 and \
                ((cur_eid % cfg.snapshot_epoch) == 0 or (cur_eid + 1) == end_epoch):
            save_name = str(
                cur_eid) if cur_eid + 1 != end_epoch else "model_final"
            save_model(model, optimizer, save_dir, save_name, cur_eid + 1)
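
The logging branch above derives the ETA from the global average step time
(so it reflects the whole run) and the throughput (ips) from the smoothed
window average (so it reflects recent steps). A small worked example with
made-up numbers:

import datetime

batch_size = 8
total_steps = 10000
step_id = 2500
window_avg = 0.25        # batch_time.avg, smoothed seconds per step
global_avg = 0.26        # batch_time.global_avg, over all steps so far

eta_sec = (total_steps - step_id) * global_avg
eta_str = str(datetime.timedelta(seconds=int(eta_sec)))  # '0:32:30'
ips = float(batch_size) / window_avg                     # 32.0 images/s
print(eta_str, ips)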
Example 5
    def train(self, validate=False):
        assert self.mode == 'train', "Model not in 'train' mode"

        # if validation in training is enabled, metrics should be re-init
        if validate:
            self._init_metrics(validate=validate)
            self._reset_metrics()

        model = self.model
        if self.cfg.fleet:
            model = fleet.distributed_model(model)
            self.optimizer = fleet.distributed_optimizer(
                self.optimizer).user_defined_optimizer
        elif self._nranks > 1:
            model = paddle.DataParallel(self.model)

        # initialize the AMP (fp16) gradient scaler
        if self.cfg.fp16:
            scaler = amp.GradScaler(enable=self.cfg.use_gpu,
                                    init_loss_scaling=1024)

        self.status.update({
            'epoch_id': self.start_epoch,
            'step_id': 0,
            'steps_per_epoch': len(self.loader)
        })

        self.status['batch_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                        fmt='{avg:.4f}')
        self.status['data_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                       fmt='{avg:.4f}')
        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

        for epoch_id in range(self.start_epoch, self.cfg.epoch):
            self.status['mode'] = 'train'
            self.status['epoch_id'] = epoch_id
            self._compose_callback.on_epoch_begin(self.status)
            self.loader.dataset.set_epoch(epoch_id)
            model.train()
            iter_tic = time.time()
            for step_id, data in enumerate(self.loader):
                self.status['data_time'].update(time.time() - iter_tic)
                self.status['step_id'] = step_id
                self._compose_callback.on_step_begin(self.status)

                if self.cfg.fp16:
                    with amp.auto_cast(enable=self.cfg.use_gpu):
                        # model forward
                        outputs = model(data)
                        loss = outputs['loss']

                    # model backward
                    scaled_loss = scaler.scale(loss)
                    scaled_loss.backward()
                    # in dygraph mode, optimizer.minimize is equivalent to optimizer.step
                    scaler.minimize(self.optimizer, scaled_loss)
                else:
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']
                    # model backward
                    loss.backward()
                    self.optimizer.step()

                curr_lr = self.optimizer.get_lr()
                self.lr.step()
                self.optimizer.clear_grad()
                self.status['learning_rate'] = curr_lr

                if self._nranks < 2 or self._local_rank == 0:
                    self.status['training_staus'].update(outputs)

                self.status['batch_time'].update(time.time() - iter_tic)
                self._compose_callback.on_step_end(self.status)
                if self.use_ema:
                    self.ema.update(self.model)
                iter_tic = time.time()

            # swap the EMA weights into the model; the deep copy is required
            # because set_dict writes into the very tensors that a plain
            # state_dict() reference would still point at (see Example 2)
            if self.use_ema:
                weight = copy.deepcopy(self.model.state_dict())
                self.model.set_dict(self.ema.apply())

            self._compose_callback.on_epoch_end(self.status)

            if validate and (self._nranks < 2 or self._local_rank == 0) \
                    and (epoch_id % self.cfg.snapshot_epoch == 0 \
                             or epoch_id == self.end_epoch - 1):
                if not hasattr(self, '_eval_loader'):
                    # build evaluation dataset and loader
                    self._eval_dataset = self.cfg.EvalDataset
                    self._eval_batch_sampler = \
                        paddle.io.BatchSampler(
                            self._eval_dataset,
                            batch_size=self.cfg.EvalReader['batch_size'])
                    self._eval_loader = create('EvalReader')(
                        self._eval_dataset,
                        self.cfg.worker_num,
                        batch_sampler=self._eval_batch_sampler)
                with paddle.no_grad():
                    self.status['save_best_model'] = True
                    self._eval_with_loader(self._eval_loader)

            # restore the original (non-EMA) weights on the model
            if self.use_ema:
                self.model.set_dict(weight)
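
Examples 2 and 5 share the same AMP recipe: forward under amp.auto_cast,
backward on the scaled loss, then scaler.minimize, which unscales the
gradients, skips the update if any of them overflowed, and adjusts the loss
scale. Below is a self-contained sketch of that step (the tiny linear model
and the random data are placeholders, not part of the examples above).

import paddle
from paddle import amp

use_gpu = paddle.is_compiled_with_cuda()
model = paddle.nn.Linear(4, 1)
optimizer = paddle.optimizer.SGD(learning_rate=0.01,
                                 parameters=model.parameters())
scaler = amp.GradScaler(enable=use_gpu, init_loss_scaling=1024)

for _ in range(3):
    x = paddle.rand([8, 4])
    with amp.auto_cast(enable=use_gpu):
        loss = model(x).mean()          # forward runs in mixed precision
    scaled = scaler.scale(loss)         # multiply the loss by the loss scale
    scaled.backward()                   # gradients carry the same scale
    # unscale the gradients and apply the optimizer step (skipped on overflow)
    scaler.minimize(optimizer, scaled)
    optimizer.clear_grad()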