Example #1
def _get_gluon_metrics(train_config):
    metrics_gluon = {
        'value_loss':
        metric.MSE(name='value_loss', output_names=['value_output']),
        'value_acc_sign':
        metric.create(acc_sign,
                      name='value_acc_sign',
                      output_names=['value_output'],
                      label_names=['value_label']),
    }
    if train_config.sparse_policy_label:
        # the default cross entropy only supports sparse labels
        metrics_gluon['policy_loss'] = metric.CrossEntropy(
            name='policy_loss',
            output_names=['policy_output'],
            label_names=['policy_label'])
        metrics_gluon['policy_acc'] = metric.Accuracy(
            axis=1,
            name='policy_acc',
            output_names=['policy_output'],
            label_names=['policy_label'])
    else:
        metrics_gluon['policy_loss'] = metric.create(
            cross_entropy,
            name='policy_loss',
            output_names=['policy_output'],
            label_names=['policy_label'])
        metrics_gluon['policy_acc'] = metric.create(
            acc_distribution,
            name='policy_acc',
            output_names=['policy_output'],
            label_names=['policy_label'])
    return metrics_gluon
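Note: `acc_sign`, `cross_entropy` and `acc_distribution` are custom metric functions defined elsewhere in the project. `metric.create` also accepts a plain Python callable of the form `feval(label, pred)` and wraps it in a `CustomMetric`. A minimal sketch of what such a function could look like (a hypothetical stand-in, not the project's actual implementation):

import numpy as np
from mxnet import metric

def acc_sign(label, pred):
    # fraction of samples whose predicted value has the same sign as the label;
    # CustomMetric hands feval numpy arrays, so plain numpy ops work here
    return np.mean(np.sign(pred.flatten()) == np.sign(label.flatten()))

value_acc_sign = metric.create(acc_sign,
                               name='value_acc_sign',
                               output_names=['value_output'],
                               label_names=['value_label'])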
Example #2
    def validate_model(self, valid_iter, eval_metric):
        """
        Parameters
        ----------
        eval_metric : str
            One of "accuracy", "ce" (CrossEntropy), "f1", "mae", "mse",
            "rmse", "top_k_accuracy".
        """
        # res = self.train_module.score(eval_data, validation_metric)

        eval_metric_fn = metric.create(eval_metric)
        end_of_batch = False
        nbatch = 0
        while not end_of_batch:
            try:
                valid_batch = valid_iter.next()
                self.train_module.forward(valid_batch)
                self.train_module.update_metric(eval_metric_fn,
                                                valid_batch.label)
                for name, value in eval_metric_fn.get_name_value():
                    self.writer.add_scalar(tag="Validation " + name,
                                           value=value,
                                           global_step=nbatch)
                    print("Batch[%d] Validation-%s=%.3f" %
                          (nbatch, name, value))
            except StopIteration:  # raised by the iterator when exhausted
                end_of_batch = True
            else:
                nbatch += 1
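For reference, the same pattern can be driven without a wrapper class by iterating the validation iterator directly; a minimal sketch (assuming MXNet 1.x, a bound `Module` named `mod`, and numpy arrays `x_val`/`y_val`):

import mxnet as mx

val_iter = mx.io.NDArrayIter({'data': x_val}, {'softmax_label': y_val},
                             batch_size=64)
eval_metric_fn = mx.metric.create('ce')
for nbatch, valid_batch in enumerate(val_iter):
    mod.forward(valid_batch, is_train=False)
    mod.update_metric(eval_metric_fn, valid_batch.label)
    for name, value in eval_metric_fn.get_name_value():
        print("Batch[%d] Validation-%s=%.3f" % (nbatch, name, value))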
Example #3
def _get_mxnet_metrics(train_config):
    metrics_mxnet = [
        metric.MSE(name='value_loss',
                   output_names=['value_output'],
                   label_names=['value_label']),
        metric.CrossEntropy(name='policy_loss',
                            output_names=['policy_output'],
                            label_names=['policy_label']),
        metric.create(acc_sign,
                      name='value_acc_sign',
                      output_names=['value_output'],
                      label_names=['value_label']),
        metric.Accuracy(axis=1,
                        name='policy_acc',
                        output_names=['policy_output'],
                        label_names=['policy_label'])
    ]
    if train_config.use_wdl:
        metrics_mxnet.append(
            metric.CrossEntropy(name='wdl_loss',
                                output_names=['wdl_output'],
                                label_names=['wdl_label']))
        metrics_mxnet.append(
            metric.Accuracy(axis=1,
                            name='wdl_acc',
                            output_names=['wdl_output'],
                            label_names=['wdl_label']))
    if train_config.use_plys_to_end:
        metrics_mxnet.append(
            metric.MSE(name='plys_to_end_loss',
                       output_names=['plys_to_end_output'],
                       label_names=['plys_to_end_label']))
    return metrics_mxnet
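Module-based scoring expects a single `EvalMetric`, so a list like the one returned here is usually wrapped in a `CompositeEvalMetric`; a minimal sketch (assuming MXNet 1.x):

import mxnet as mx

composite = mx.metric.CompositeEvalMetric()
for m in _get_mxnet_metrics(train_config):
    composite.add(m)
# model.score(val_iter, composite) now reports every child metric at once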
Example #4
    def score(self, X, eval_metric='acc', num_batch=None, batch_end_callback=None, reset=True):
        """Run the model on X and calculate the score with eval_metric
        Parameters
        ----------
        X : mxnet.DataIter
        eval_metric : str or metric.EvalMetric
            The metric used to calculate the score
        num_batch : int or None
            The number of batches to run; goes through all batches if None
        Returns
        -------
        s : float
            the final score
        """
        # setup metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        X = self._init_iter(X, None, is_train=False)
        if reset:
            X.reset()

        data_shapes = X.provide_data
        data_names = [x[0] for x in data_shapes]
        self._init_predictor(data_shapes)
        data_arrays = [self._pred_exec.arg_dict[name] for name in data_names]

        for i, batch in enumerate(X):
            if num_batch is not None and i == num_batch:
                break
            _load_data(batch, data_arrays)
            self._pred_exec.forward(is_train=False)
            eval_metric.update(batch.label, self._pred_exec.outputs)

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=0,
                                                 nbatch=i,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                if isinstance(batch_end_callback, list):
                    for call in batch_end_callback:
                        call(batch_end_params)
                else:
                    batch_end_callback(batch_end_params)
        return eval_metric.get()[1]
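A sketch of how this `score` variant might be called, with random placeholder data and `mx.callback.Speedometer` (the stock batch-end callback) for progress logging; `model` is assumed to be an instance of the surrounding class:

import mxnet as mx
import numpy as np

X = mx.io.NDArrayIter(np.random.rand(1000, 100),
                      np.random.randint(0, 10, (1000,)),
                      batch_size=50)
speedometer = mx.callback.Speedometer(batch_size=50, frequent=10)
accuracy = model.score(X, eval_metric='acc',
                       batch_end_callback=speedometer)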
Example #5
    def network_backprop_setup(self, grad_req, arg_names, arg_shapes,
                               eval_metric):

        if grad_req != 'null':
            self.grad_params = {}
            for name, shape in zip(arg_names, arg_shapes):
                if not (name.endswith('data') or name.endswith("mean_face")
                        or name.endswith('cls_label')
                        or name.endswith('proj_weight')
                        or name.endswith('proj_label')
                        or name.endswith('ground_truth')
                        or name.endswith('ellipse_label')
                        or name.endswith("bbox_weight")):
                    self.grad_params[name] = mx.nd.zeros(shape, self.ctx)

        # setting the required optimizer
        self.optimizer = opt.create(self.optimizer,
                                    rescale_grad=1.0,
                                    **(self.kwargs))
        self.updater = get_updater(self.optimizer)
        eval_metric = metric.create(eval_metric)

        return eval_metric
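`get_updater` turns an `Optimizer` into a closure `updater(index, grad, weight)` that applies one update step in place; a minimal sketch of how the returned pieces are typically used (assuming MXNet 1.x):

import mxnet as mx

optimizer = mx.optimizer.create('sgd', learning_rate=0.01, rescale_grad=1.0)
updater = mx.optimizer.get_updater(optimizer)

weight = mx.nd.ones((3,))
grad = mx.nd.ones((3,))
updater(0, grad, weight)   # in-place: weight -= lr * grad (wd is 0 by default)
print(weight.asnumpy())    # -> [0.99 0.99 0.99]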
Example #6
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            validate_metric=None,
            work_load_list=None,
            epoch_end_callback=None,
            batch_end_callback=None,
            fixed_param_prefix=None,
            initializer=None,
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            optimizer=None,
            optimizer_params=None,
            begin_epoch=0,
            num_epoch=None,
            kvstore='device',
            teacher_modules=None):
        if not isinstance(teacher_modules, list):
            teacher_modules = [teacher_modules]
        self.module.bind(data_shapes=self.data_shapes,
                         label_shapes=self.label_shapes,
                         for_training=True)
        self.module.init_params(initializer=initializer,
                                arg_params=arg_params,
                                aux_params=aux_params,
                                allow_missing=allow_missing)
        self.module.init_optimizer(kvstore=kvstore,
                                   optimizer=optimizer,
                                   optimizer_params=optimizer_params)

        if validate_metric is None:
            validate_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        # training loop
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            nbatch = 0
            data_iter = iter(train_data)
            end_of_batch = False
            next_data_batch = next(data_iter)
            while not end_of_batch:
                data_batch = next_data_batch

                if teacher_modules[0] is not None:
                    for teacher_module in teacher_modules:
                        teacher_module.forward(data_batch=data_batch,
                                               is_train=True)
                        transfer_label = teacher_module.get_outputs()
                        data_batch.label = data_batch.label + transfer_label
                self.module.forward(data_batch, is_train=True)
                self.module.backward()
                self.module.update()

                try:
                    next_data_batch = next(data_iter)
                except StopIteration:
                    end_of_batch = True

                self.module.update_metric(eval_metric, data_batch.label)

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                nbatch += 1

            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

            arg_params, aux_params = self.module.get_params()
            self.module.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)
            if eval_data:
                res = self.module.score(eval_data,
                                        validate_metric,
                                        score_end_callback=None,
                                        batch_end_callback=None,
                                        reset=True,
                                        epoch=epoch)
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                     val)

            train_data.reset()
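The callback handling above relies on `_as_list`, a small helper (in MXNet it lives in `mxnet.base`) that lets callers pass either a single callback or a list of them; a sketch of its behavior:

def _as_list(obj):
    """Return obj unchanged if it is a list/tuple, else wrap it in a list."""
    if isinstance(obj, (list, tuple)):
        return obj
    return [obj]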
Example #7
def run_training(alpha, queue):
    _, x_val, yv_val, yp_val, plys_to_end, _ = load_pgn_dataset(
        dataset_type='val', part_id=0, verbose=True, normalize=tc.normalize)
    if tc.discount != 1:
        yv_val *= tc.discount**plys_to_end

    if tc.select_policy_from_plane:
        val_iter = mx.io.NDArrayIter(
            {'data': x_val}, {
                'value_label': yv_val,
                'policy_label': np.array(FLAT_PLANE_IDX)[yp_val.argmax(axis=1)]
            }, tc.batch_size)
    else:
        val_iter = mx.io.NDArrayIter({'data': x_val}, {
            'value_label': yv_val,
            'policy_label': yp_val.argmax(axis=1)
        }, tc.batch_size)

    tc.nb_parts = len(glob.glob(main_config['planes_train_dir'] + '**/*'))

    nb_it_per_epoch = (
        len(x_val) * tc.nb_parts
    ) // tc.batch_size  # calculate how many iterations per epoch exist
    # one iteration is defined by passing 1 batch and doing backprop
    tc.total_it = int(nb_it_per_epoch * tc.nb_training_epochs)

    ### Define a Learning Rate schedule
    to.lr_schedule = OneCycleSchedule(start_lr=tc.max_lr / 8,
                                      max_lr=tc.max_lr,
                                      cycle_length=tc.total_it * .3,
                                      cooldown_length=tc.total_it * .6,
                                      finish_lr=tc.min_lr)
    to.lr_schedule = LinearWarmUp(to.lr_schedule,
                                  start_lr=tc.min_lr,
                                  length=tc.total_it / 30)

    ### Momentum schedule
    to.momentum_schedule = MomentumSchedule(to.lr_schedule, tc.min_lr,
                                            tc.max_lr, tc.min_momentum,
                                            tc.max_momentum)
    plot_schedule(to.momentum_schedule,
                  iterations=tc.total_it,
                  ylabel='Momentum')

    input_shape = x_val[0].shape

    beta = np.sqrt(2 / alpha)

    print("alpha:", alpha)
    print("beta:", beta)

    depth = int(round(base_depth * alpha))
    channels = int(round(base_channels * beta))

    kernels = [3] * depth
    se_types = [None] * len(kernels)
    channels_reduced = int(round(channels / 4))

    symbol = rise_mobile_v3_symbol(channels=channels,
                                   channels_operating_init=channels_reduced,
                                   act_type='relu',
                                   channels_value_head=8,
                                   value_fc_size=256,
                                   channels_policy_head=NB_POLICY_MAP_CHANNELS,
                                   grad_scale_value=tc.val_loss_factor,
                                   grad_scale_policy=tc.policy_loss_factor,
                                   dropout_rate=tc.dropout_rate,
                                   select_policy_from_plane=True,
                                   kernels=kernels,
                                   se_types=se_types)

    # create a trainable module on compute context
    model = mx.mod.Module(symbol=symbol,
                          context=ctx,
                          label_names=['value_label', 'policy_label'])
    model.bind(for_training=True,
               data_shapes=[('data', (tc.batch_size, input_shape[0],
                                      input_shape[1], input_shape[2]))],
               label_shapes=val_iter.provide_label)
    model.init_params(
        mx.initializer.Xavier(rnd_type='uniform',
                              factor_type='avg',
                              magnitude=2.24))

    metrics_mxnet = [
        metric.MSE(name='value_loss',
                   output_names=['value_output'],
                   label_names=['value_label']),
        metric.CrossEntropy(name='policy_loss',
                            output_names=['policy_output'],
                            label_names=['policy_label']),
        metric.create(acc_sign,
                      name='value_acc_sign',
                      output_names=['value_output'],
                      label_names=['value_label']),
        metric.Accuracy(axis=1,
                        name='policy_acc',
                        output_names=['policy_output'],
                        label_names=['policy_label'])
    ]

    to.metrics = metrics_mxnet
    train_agent = TrainerAgentMXNET(model,
                                    symbol,
                                    val_iter,
                                    tc,
                                    to,
                                    use_rtpt=True)
    print("model.score(val_iter, to.metrics:",
          model.score(val_iter, to.metrics))

    # Start the training process
    _, (k_steps_best, val_metric_values_best) = train_agent.train(cur_it)

    new_row = {
        'alpha': alpha,
        'beta': beta,
        'depth': depth,
        'channels': channels,
        'k_steps_best': k_steps_best,
        'val_loss': val_metric_values_best['loss'],
        'val_value_loss': val_metric_values_best['value_loss'],
        'val_policy_loss': val_metric_values_best['policy_loss'],
        'val_policy_acc': val_metric_values_best['policy_acc'],
        'val_value_acc': val_metric_values_best['value_acc_sign']
    }

    queue.put(new_row)
    print(new_row)
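The discounting step near the top (`yv_val *= tc.discount**plys_to_end`) scales each value target by how far the position is from the end of the game, pulling targets from long games toward 0; a small numpy demonstration of the broadcasting involved:

import numpy as np

discount = 0.99
yv_val = np.array([1.0, -1.0, 1.0])    # game outcome seen from each position
plys_to_end = np.array([2, 10, 60])    # plies remaining until the game ends
yv_val *= discount ** plys_to_end
print(yv_val)                          # approx. [ 0.98 -0.90  0.55]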
Example #8
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(('learning_rate', 0.01), ),
            eval_end_callback=None,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            validation_metric=None,
            monitor=None):
        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label,
                  for_training=True,
                  force_rebind=force_rebind)
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer,
                         arg_params=arg_params,
                         aux_params=aux_params,
                         allow_missing=allow_missing,
                         force_init=force_init)
        self.init_optimizer(kvstore=kvstore,
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)
        #### chris_arg
        # TASK_LIMIT: 0 = no per-task bandwidth limit, 1 = per-task limit
        # updated every round, 2 = per-task limit that stays fixed
        if int(os.getenv("TASK_LIMIT", 0)) != 0:
            get_task_cmd = "sh /home/ubuntu/tc.sh -l 1"
        else:
            self.logger.info("no_task_bandwidth_limit")
            get_task_cmd = "sh /home/ubuntu/tc.sh -l 0"
        os.system(get_task_cmd)
        delay_time = float(os.getenv("DELAY_TIME", 0.8))
        ps_upload_bandwidth_part1 = int(os.getenv("PS_UPLOAD_BANDWIDTH1",
                                                  2000))
        worker_upload_bandwidth_part1 = int(
            os.getenv("WORKER_UPLOAD_BANDWIDTH1", 2000))
        ps_upload_bandwidth_part2 = int(os.getenv("PS_UPLOAD_BANDWIDTH2",
                                                  2000))
        worker_upload_bandwidth_part2 = int(
            os.getenv("WORKER_UPLOAD_BANDWIDTH2", 2000))
        tc_command = "sudo tc class change dev {} parent 1: classid 1:3 htb rate {}mbit ceil {}mbit  && sudo tc class change dev {} parent 1: classid 1:4 htb rate {}mbit ceil {}mbit"
        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            nbatch = 0
            data_iter = iter(train_data)
            end_of_batch = False
            next_data_batch = next(data_iter)
            while not end_of_batch:
                data_batch = next_data_batch
                if monitor is not None:
                    monitor.tic()
                self.forward(data_batch, is_train=True)
                if int(os.getenv("TASK_LIMIT", 0)) == 1:
                    ##first part bandwidth allocation
                    ndarray.waitall()
                    # self.logger.info("change bandwidth part1:, "+str(time.time()))
                    x = str(ps_upload_bandwidth_part1)
                    y = str(worker_upload_bandwidth_part1)
                    cmd_up = tc_command.format("ens3", x, x, "ens3", y, y)
                    cmd_down = tc_command.format("ifb0", y, y, "ifb0", x, x)
                    os.system(cmd_up)
                    # os.system(cmd_down)
                # self.logger.info("after forward, "+str(time.time()))
                self.backward()
                # self.logger.info("before update: "+str(time.time()))
                self.update()  # executed asynchronously
                if int(os.getenv("TASK_LIMIT", 0)) == 1:
                    x = str(ps_upload_bandwidth_part2)
                    y = str(worker_upload_bandwidth_part2)
                    cmd_up = tc_command.format("ens3", x, x, "ens3", y, y)
                    cmd_down = tc_command.format("ifb0", y, y, "ifb0", x, x)
                    time.sleep(delay_time)
                    ##second part bandwidth allocation
                    # self.logger.info("change bandwidth part2:, "+str(time.time()))
                    os.system(cmd_up)
                    # os.system(cmd_down)
                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch)
                except StopIteration:
                    end_of_batch = True

                self.update_metric(eval_metric, data_batch.label)

                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                nbatch += 1

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            #----------------------------------------
            # evaluation on validation set
            if eval_data:
                res = self.score(eval_data,
                                 validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback,
                                 epoch=epoch)
                #TODO: pull this into default
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                     val)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
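For clarity, the `tc_command` template expands into two chained `tc class change` invocations on the same device: classid 1:3 receives the parameter-server rate `x` and classid 1:4 the worker rate `y`. An illustration with example values:

x, y = "2000", "1000"
print(tc_command.format("ens3", x, x, "ens3", y, y))
# sudo tc class change dev ens3 parent 1: classid 1:3 htb rate 2000mbit ceil 2000mbit
#   && sudo tc class change dev ens3 parent 1: classid 1:4 htb rate 1000mbit ceil 1000mbit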
Example #9
    def fit(self,
            train_data,
            eval_data,
            eval_metric='mse',
            grad_req='write',
            epoch_end_callback=None,
            batch_end_callback=None,
            kv_store='local',
            logger=None):

        if logger is None:
            logger = logging
        logger.info('Starting training with %s', str(self.ctx))
        arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
            data=train_data.provide_data[0][1])
        arg_names = self.symbol.list_arguments()

        if grad_req != 'null':
            self.grad_params = {}
            for name, shape in zip(arg_names, arg_shapes):
                if not (name.endswith('data') or name.endswith('label')):
                    self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
        # init the params
        self.arg_params = {
            k: mx.nd.zeros(s, self.ctx)
            for k, s in zip(arg_names, arg_shapes)
        }
        for k, v in self.arg_params.items():
            if not (k.endswith('data') or k.endswith('label')):
                self.initializer(k, v)

        #init the aux params
        aux_names = self.symbol.list_auxiliary_states()
        self.aux_params = {
            k: mx.nd.zeros(s, self.ctx)
            for k, s in zip(aux_names, aux_shapes)
        }

        data_name = train_data.data_name
        label_name = train_data.label_name
        input_names = [data_name, label_name]

        self.optimizer = mx.optimizer.create(
            self.optimizer,
            rescale_grad=(1.0 / train_data.get_batch_size()),
            **(self.kwargs))
        self.updater = mx.optimizer.get_updater(self.optimizer)

        eval_metric = metric.create(eval_metric)

        # begin training
        for epoch in range(self.begin_epoch, self.num_epoch):
            nbatch = 0
            train_data.reset()
            eval_metric.reset()

            #train
            for databatch in train_data:
                nbatch += 1
                for k, v in databatch.data.items():
                    self.arg_params[k] = mx.nd.array(v, self.ctx)
                for k, v in databatch.label.items():
                    self.arg_params[k] = mx.nd.array(v, self.ctx)
                executor = self.symbol.bind(self.ctx,
                                            self.arg_params,
                                            args_grad=self.grad_params,
                                            grad_req=grad_req,
                                            aux_states=self.aux_params)
                update_dict = {
                    name: nd
                    for name, nd in zip(self.symbol.list_arguments(),
                                        executor.grad_arrays) if nd is not None
                }
                output_dict = {
                    name: nd
                    for name, nd in zip(self.symbol.list_outputs(),
                                        executor.outputs)
                }
                # pdb.set_trace()
                executor.forward(is_train=True)
                executor.backward()

                for key, arr in update_dict.items():
                    self.updater(key, arr, self.arg_params[key])

                label = self.arg_params['lr_label']
                pred = output_dict['lr_output']
                eval_metric.update([label], [pred])
                executor.outputs[0].wait_to_read()
                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric)
                    batch_end_callback(batch_end_params)
            if epoch_end_callback is not None:
                epoch_end_callback(epoch, self.symbol, self.arg_params,
                                   self.aux_params)
            # pdb.set_trace()
            name, value = eval_metric.get()
            logger.info("------------------------------>Epoch[%d] Train-%s=%f",
                        epoch, name, value)

            #begin evaluation
            if eval_data:
                logger.info("in eval process...")
                nbatch = 0
                eval_data.reset()
                eval_metric.reset()
                for databatch in eval_data:
                    nbatch += 1
                    for k, v in databatch.data.items():
                        self.arg_params[k] = mx.nd.array(v, self.ctx)
                    for k, v in databatch.label.items():
                        self.arg_params[k] = mx.nd.array(v, self.ctx)
                    executor = self.symbol.bind(self.ctx,
                                                self.arg_params,
                                                args_grad=self.grad_params,
                                                grad_req=grad_req,
                                                aux_states=self.aux_params)

                    output_dict = {
                        name: nd
                        for name, nd in zip(self.symbol.list_outputs(),
                                            executor.outputs)
                    }
                    executor.forward(is_train=False)
                    label = self.arg_params['lr_label']
                    pred = output_dict['lr_output']
                    eval_metric.update([label], [pred])
            name, value = eval_metric.get()
            logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
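A sketch of how this solver's `fit` might be invoked, using the stock MXNet 1.x callbacks (`solver`, `train_data` and `eval_data` are assumed to exist; `get_batch_size()` matches the iterator interface used above):

import mxnet as mx

# prints throughput and current metric values every 20 batches
batch_cb = mx.callback.Speedometer(batch_size=train_data.get_batch_size(),
                                   frequent=20)
solver.fit(train_data, eval_data,
           eval_metric='mse',
           batch_end_callback=batch_cb,
           epoch_end_callback=mx.callback.do_checkpoint('lr_model'))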
Example #10
    def fit(
            self,
            train_data,
            ogdb,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(
                ('learning_rate',
                 0.01), ),  #,('rescale_grad', 1.0/8.0),), #8 gpu attempt
            eval_end_callback=None,
            iter_size=1,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            validation_metric=None,
            monitor=None):
        """Ke's revision: add iter_size. Trains the module parameters.

        Check out the `Module Tutorial <http://mxnet.io/tutorials/basic/module.html>`_ to see
        an end-to-end use-case.

        Parameters
        ----------
        train_data : DataIter
            Train DataIter.
        eval_data : DataIter
            If not ``None``, will be used as validation set and the performance
            after each epoch will be evaluated.
        eval_metric : str or EvalMetric
            Defaults to 'accuracy'. The performance measure used to display during training.
            Other possible predefined metrics are:
            'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'.
        epoch_end_callback : function or list of functions
            Each callback will be called with the current `epoch`, `symbol`, `arg_params`
            and `aux_params`.
        batch_end_callback : function or list of function
            Each callback will be called with a `BatchEndParam`.
        kvstore : str or KVStore
            Defaults to 'local'.
        optimizer : str or Optimizer
            Defaults to 'sgd'.
        optimizer_params : dict
            Defaults to ``(('learning_rate', 0.01),)``. The parameters for
            the optimizer constructor.
            The default value is not a dict, just to avoid pylint warning on dangerous
            default values.
        eval_end_callback : function or list of function
            These will be called at the end of each full evaluation, with the metrics over
            the entire evaluation set.
        eval_batch_end_callback : function or list of function
            These will be called at the end of each mini-batch during evaluation.
        initializer : Initializer
            The initializer is called to initialize the module parameters when they are
            not already initialized.
        arg_params : dict
            Defaults to ``None``, if not ``None``, should be existing parameters from a trained
            model or loaded from a checkpoint (previously saved model). In this case,
            the value here will be used to initialize the module parameters, unless they
            are already initialized by the user via a call to `init_params` or `fit`.
            `arg_params` has a higher priority than `initializer`.
        aux_params : dict
            Defaults to ``None``. Similar to `arg_params`, except for auxiliary states.
        allow_missing : bool
            Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params`
            and `aux_params` are not ``None``. If this is ``True``, then the missing parameters
            will be initialized via the `initializer`.
        force_rebind : bool
            Defaults to ``False``. Whether to force rebinding the executors if already bound.
        force_init : bool
            Defaults to ``False``. Indicates whether to force initialization even if the
            parameters are already initialized.
        begin_epoch : int
            Defaults to 0. Indicates the starting epoch. Usually, if resumed from a
            checkpoint saved at a previous training phase at epoch N, then this value should be
            N+1.
        num_epoch : int
            Number of epochs for training.

        Examples
        --------
        >>> # An example of using fit for training.
        >>> # Assume training dataIter and validation dataIter are ready
        >>> # Assume loading a previously checkpointed model
        >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3)
        >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer='sgd',
        ...     optimizer_params={'learning_rate':0.01, 'momentum': 0.9},
        ...     arg_params=arg_params, aux_params=aux_params,
        ...     eval_metric='acc', num_epoch=10, begin_epoch=3)
        """
        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label,
                  for_training=True,
                  force_rebind=force_rebind,
                  grad_req='add')
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer,
                         arg_params=arg_params,
                         aux_params=aux_params,
                         allow_missing=allow_missing,
                         force_init=force_init)
        self.init_optimizer(kvstore=kvstore,
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        annealing_steps = 0  # number of current annealing steps in current epoch
        redo_training = 0  # Flag to redo training / resample
        val_list = []  # list of validation results per annealing step
        cur_val = 0
        target_prec = 50
        #Note: we want to identify the best cluster of images / training sets with a low percentage
        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            nbatch = 0
            if redo_training:
                annealing_steps = annealing_steps + 1
                self.logger.info('Redoing training to meet criteria = %d',
                                 annealing_steps)
                #sroidb = train_data.roidb #passthrough test

                atick = time.time()

                iterdiff = 1.0
                # Check if we've stagnated
                if len(val_list) > 2:
                    itermean = (val_list[-1] + val_list[-2] + val_list[-3]) / 3
                    iterdiff = abs(itermean - val_list[-1])
                    self.logger.info('Last 3 samples have diff of: %f',
                                     iterdiff)

                if iterdiff < 0.01:
                    self.logger.info(
                        'Reached a stagnated annealing criteria, dumping current samples'
                    )
                    # Do something drastic
                    # Lets try to instantly use the original db
                    sroidb = ogdb

                    # Try to read in another random subset
                    #sroidb = sample_roidb(ogdb, 25) # Sample with removal
                else:
                    # Continue as usual
                    # Select a new random subset
                    newroidb = sample_roidb(ogdb,
                                            15)  # Without removal, this is 10%

                    # Append old with new
                    sroidb = append_roidb(train_data.roidb, newroidb)

                # Create new training data instance by passing most of previous arguments and new random db
                train_data2 = AnchorLoader(
                    train_data.feat_sym,
                    sroidb,
                    train_data.batch_size,
                    train_data.shuffle,
                    train_data.ctx,
                    train_data.work_load_list,
                    train_data.feat_stride,
                    train_data.anchor_scales,
                    train_data.anchor_ratios,
                    train_data.aspect_grouping,
                    nThreads=default.prefetch_thread_num)

                # Overwrite old train_data with the new one
                train_data = train_data2
                data_iter = iter(train_data)

                atock = time.time()
                self.logger.info('Annealing[%d] Time cost=%.3f',
                                 annealing_steps, (atock - atick))
            else:
                data_iter = iter(train_data)
                annealing_steps = 0
                val_list = []
                #target_prec=cur_val+5
                target_prec = target_prec + 5
            end_of_batch = False
            next_data_batch = next(data_iter)

            while not end_of_batch:
                data_batch = next_data_batch
                if monitor is not None:
                    monitor.tic()
                # self.forward_backward(data_batch)
                self.forward(data_batch, is_train=True, grad_req='add')
                self.backward()
                if nbatch % iter_size == 0:  # update every iter_size batches
                    self.update()
                    for g in self._curr_module._exec_group.grad_arrays:
                        for g1 in g:
                            if g1 is not None:
                                g1[:] = 0.

                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch)
                except StopIteration:
                    end_of_batch = True

                self.update_metric(eval_metric, data_batch.label)

                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                nbatch += 1

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))
            #print('Epoch[%d] Time cost=%.3f', epoch, (toc-tic))
            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    cur_val = callback(epoch, self.symbol, arg_params,
                                       aux_params)

            self.logger.info('Returned Validation=%f', cur_val)
            val_list.append(cur_val)
            #----------------------------------------
            # evaluation on validation set
            if eval_data:
                self.logger.info('Evaluating data')
                res = self.score(eval_data,
                                 validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback,
                                 epoch=epoch)
                #TODO: pull this into default
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                     val)

            #----------
            # Check epoch if it falls within the validation threshold
            if cur_val < target_prec:
                # Evaluate list of precision/validation results first
                #val_list
                print(eval_data)

                #else
                redo_training = 1
                self.logger.info('Retraining data=%f', cur_val)
            else:
                redo_training = 0

            self.logger.info('Annealing steps=%f', annealing_steps)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
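`sample_roidb` and `append_roidb` are project-specific helpers that are not shown here. Under the assumption that a roidb is a plain list of per-image record dicts, they might look like the following sketch (hypothetical, for orientation only):

import random

def sample_roidb(roidb, percentage):
    """Return a random subset of roughly `percentage` percent of the records."""
    k = max(1, int(len(roidb) * percentage / 100.0))
    return random.sample(roidb, k)

def append_roidb(roidb_a, roidb_b):
    """Concatenate two roidbs into a new list."""
    return roidb_a + roidb_b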
Example #11
    def fit(self,
            X,
            marks,
            e_marks=None,
            y=None, eval_data=None, eval_metric='acc',
            epoch_end_callback=None, batch_end_callback=None, time_step_callback=None,
            kvstore='local', logger=None,
            work_load_list=None, monitor=None, eval_batch_end_callback=None):
        """Overwrite"""

        data = self._init_iter(X, y, is_train=True)
        eval_data = self._init_eval_iter(eval_data)

        if self.sym_gen:
            self.symbol = self.sym_gen(
                data.default_bucket_key)  # pylint: disable=no-member
            self._check_arguments()
        self.kwargs["sym"] = self.symbol

        param_dict = dict(data.provide_data + data.provide_label)
        arg_names, param_names, aux_names = self._init_params(param_dict)

        # setup metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        # create kvstore
        (kvstore, update_on_kvstore) = _create_kvstore(
            kvstore, len(self.ctx), self.arg_params)

        param_idx2name = {}
        if update_on_kvstore:
            param_idx2name.update(enumerate(param_names))
        else:
            for i, n in enumerate(param_names):
                for k in range(len(self.ctx)):
                    param_idx2name[i * len(self.ctx) + k] = n
        self.kwargs["param_idx2name"] = param_idx2name

        # init optimizer
        if isinstance(self.optimizer, str):
            batch_size = data.batch_size
            if kvstore and kvstore.type == 'dist_sync':
                batch_size *= kvstore.num_workers
            optimizer = opt.create(self.optimizer,
                                   rescale_grad=(1.0 / batch_size),
                                   **(self.kwargs))
        elif isinstance(self.optimizer, opt.Optimizer):
            optimizer = self.optimizer
        else:
            raise TypeError('optimizer must be a str or an opt.Optimizer')

        # do training
        _train_rnn(self.symbol, self.ctx,
                   marks,
                   arg_names, param_names, aux_names,
                   self.arg_params, self.aux_params,
                   begin_epoch=self.begin_epoch, end_epoch=self.num_epoch,
                   epoch_size=self.epoch_size,
                   optimizer=optimizer,
                   train_data=data, eval_data=eval_data,
                   eval_metric=eval_metric,
                   epoch_end_callback=epoch_end_callback,
                   batch_end_callback=batch_end_callback,
                   time_step_callback=time_step_callback,
                   kvstore=kvstore, update_on_kvstore=update_on_kvstore,
                   logger=logger, work_load_list=work_load_list, monitor=monitor,
                   eval_batch_end_callback=eval_batch_end_callback,
                   sym_gen=self.sym_gen, e_marks=e_marks)
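The `param_idx2name` construction above maps flat executor parameter indices back to parameter names; when updates do not happen on the KVStore, every device holds its own copy of each parameter, hence the `i * len(self.ctx) + k` indexing. A small demonstration:

param_names = ['fc1_weight', 'fc1_bias']
ctx = ['gpu(0)', 'gpu(1)']               # two devices

param_idx2name = {}
for i, n in enumerate(param_names):
    for k in range(len(ctx)):
        param_idx2name[i * len(ctx) + k] = n
print(param_idx2name)
# {0: 'fc1_weight', 1: 'fc1_weight', 2: 'fc1_bias', 3: 'fc1_bias'}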
Example #12
    aux_names = network.list_auxiliary_states()
    aux_params = {
        k: mx.nd.zeros(s, ctx)
        for k, s in zip(aux_names, aux_shapes)
    }

    # prepare optimizer
    optimizer = opt.create('adam',
                           rescale_grad=(1.0 / dataiter.get_batch_size()),
                           learning_rate=0.01)
    updater = get_updater(optimizer)

    # create eval_metric
    eval_metric = metric.create('rmse')

    data_name = dataiter.data_name
    label_name = dataiter.label_name
    arg_params = network_args
    aux_params = network_auxs

    batch_callback = mx.callback.Speedometer(1, 10)
    epoch_callback = mx.callback.do_checkpoint(save_model_prefix)

    # begin training
    for epoch in range(10000):
        nbatch = 0
        dataiter.reset()
        eval_metric.reset()
        for data in dataiter:
Example #13
def main():
    # set debug
    DEBUG = False
    # =============setting============
    dataset = config.dataset.dataset
    batch_size = config.TRAIN.BATCH_SIZE
    lr = config.TRAIN.lr
    beta1 = config.TRAIN.beta1
    sigma = 0.02
    ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')]
    assert len(ctx) == 1, 'Multi GPU not supported.'
    ctx = ctx[0]
    frequent = config.default.frequent
    check_point = True

    logger, final_output_path = create_logger(config.output_path, args.cfg)
    prefix = os.path.join(final_output_path, config.TRAIN.model_prefix)
    train_fig_path = os.path.join(final_output_path, 'train_fig')

    train_fig_prefix = os.path.join(train_fig_path, dataset)

    if not os.path.exists(train_fig_path):
        os.makedirs(train_fig_path)

    # set random seed for reproducibility
    mx.random.seed(config.RNG_SEED)
    np.random.seed(config.RNG_SEED)

    # ==============data==============
    #train_data = pix2pixIter(config, shuffle=True, ctx=ctx)
    train_data = DataIter(config, ctx=ctx)

    step = config.TRAIN.step_epoch * train_data.size / batch_size
    step_decay = config.TRAIN.decay_epoch * train_data.size / batch_size
    if config.TRAIN.end_epoch == (config.TRAIN.step_epoch + config.TRAIN.decay_epoch):
        lr_scheduler_g = PIX2PIXScheduler(step=int(step), step_decay=int(step_decay), base_lr=lr)
        lr_scheduler_d = PIX2PIXScheduler(step=int(step), step_decay=int(step_decay), base_lr=lr/2.0)
    else:
        lr_scheduler_g = None
        lr_scheduler_d = None

    label = mx.nd.zeros((batch_size,), ctx=ctx)

    # print config
    pprint.pprint(config)
    logger.info('system:{}'.format(os.uname()))
    logger.info('mxnet path:{}'.format(mx.__file__))
    logger.info('rng seed:{}'.format(config.RNG_SEED))
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # =============Generator Module=============
    if batch_size == 1:
        if config.netG == 'autoencoder':
            generatorSymbol = defineG_encoder_decoder(config)
        elif config.netG == 'unet':
            generatorSymbol = defineG_unet(config)
        else:
            raise NotImplementedError
    else:
        if config.netG == 'autoencoder':
            generatorSymbol = defineG_encoder_decoder_batch(config)
        elif config.netG == 'unet':
            generatorSymbol = defineG_unet_batch(config)
        else:
            raise NotImplementedError

    if DEBUG:
        generatorGroup = generatorSymbol.get_internals()
        name_list = generatorGroup.list_outputs()
        out_name = []
        for name in name_list:
            if 'output' in name:
                out_name += [generatorGroup[name]]
        out_group = mx.sym.Group(out_name)
        out_shapes = out_group.infer_shape(A=(4, 3, 256, 256))

    generator = mx.mod.Module(symbol=generatorSymbol, data_names=('A', 'B',), label_names=None, context=ctx)
    generator.bind(data_shapes=train_data.provide_data)
    #draw network
    #network_test(generatorSymbol)
    # init params
    arg_params = {}
    aux_params = {}
    arg_names = generatorSymbol.list_arguments()
    aux_names = generatorSymbol.list_auxiliary_states()
    arg_shapes, _, aux_shapes = generatorSymbol.infer_shape(A=train_data.provide_data[0][1],
                                                            B=train_data.provide_data[1][1])

    if batch_size == 1:
        for idx, arg_name in enumerate(arg_names):
            if 'weight' in arg_name:
                arg_params[arg_name] = mx.random.normal(0.0, sigma, shape=arg_shapes[idx])
            elif 'gamma' in arg_name:
                arg_params[arg_name] = mx.random.normal(1.0, sigma, shape=arg_shapes[idx])
            elif 'bias' in arg_name:
                arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx])
            elif 'beta' in arg_name:
                arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx])
            else:
                # raise NameError('Unknown parameter name.')
                pass
    else:
        for idx, arg_name in enumerate(arg_names):
            if 'weight' in arg_name:
                arg_params[arg_name] = mx.random.normal(0.0, sigma, shape=arg_shapes[idx])
            elif 'gamma' in arg_name:
                arg_params[arg_name] = mx.random.normal(1.0, sigma, shape=arg_shapes[idx])
            elif 'bias' in arg_name:
                arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx])
            elif 'beta' in arg_name:
                arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx])
            else:
                # raise NameError('Unknown parameter name.')
                pass
        for idx, aux_name in enumerate(aux_names):
            if 'mean' in aux_name:
                aux_params[aux_name] = mx.nd.zeros(shape=aux_shapes[idx])
            elif 'var' in aux_name:
                aux_params[aux_name] = mx.nd.ones(shape=aux_shapes[idx])
            else:
                raise NameError('Unknown aux_name.')

    generator.init_params(arg_params=arg_params, aux_params=aux_params)

    if lr_scheduler_g is not None:
        generator.init_optimizer(
            optimizer='adam',
            optimizer_params={
                'learning_rate': lr,
                'lr_scheduler': lr_scheduler_g,
                'beta1': beta1,
                'rescale_grad': 1.0/batch_size
            })
    else:
        generator.init_optimizer(
            optimizer='adam',
            optimizer_params={
                'learning_rate': lr,
                'beta1': beta1,
                'rescale_grad': 1.0/batch_size
            })
    mods = [generator]

    # =============Discriminator Module=============
    if batch_size == 1:
        if config.netD == 'basic':
            discriminatorSymbol = defineD_basic()
        elif config.netD == 'n_layers':
            discriminatorSymbol = defineD_n_layers(n_layers=config.n_layers)
        else:
            raise NotImplementedError
    else:
        if config.netD == 'basic':
            discriminatorSymbol = defineD_basic_batch(batch_size=batch_size)
        elif config.netD == 'n_layers':
            discriminatorSymbol = defineD_n_layers_batch(n_layers=config.n_layers, batch_size=batch_size)
        else:
            raise NotImplementedError

    if DEBUG:
        generatorGroup = discriminatorSymbol.get_internals()
        name_list = generatorGroup.list_outputs()
        out_name = []
        for name in name_list:
            if 'output' in name:
                out_name += [generatorGroup[name]]
        out_group = mx.sym.Group(out_name)
        out_shapes = out_group.infer_shape(A=(1, 3, 256, 256), B=(1, 3, 256, 256))

    discriminator = mx.mod.Module(symbol=discriminatorSymbol, data_names=('A', 'B',), label_names=('label',), context=ctx)
    discriminator.bind(data_shapes=train_data.provide_data,
                       label_shapes=[('label', (batch_size,))],
                       inputs_need_grad=True)

    # init params
    arg_params = {}
    aux_params = {}
    arg_names = discriminatorSymbol.list_arguments()
    aux_names = discriminatorSymbol.list_auxiliary_states()
    arg_shapes, _, aux_shapes = discriminatorSymbol.infer_shape(A=train_data.provide_data[0][1],
                                                                B=train_data.provide_data[1][1],
                                                                label=(batch_size,))

    if batch_size == 1:
        for idx, arg_name in enumerate(arg_names):
            if 'weight' in arg_name:
                arg_params[arg_name] = mx.random.normal(0.0, sigma, shape=arg_shapes[idx])
            elif 'gamma' in arg_name:
                arg_params[arg_name] = mx.random.normal(1.0, sigma, shape=arg_shapes[idx])
            elif 'bias' in arg_name:
                arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx])
            elif 'beta' in arg_name:
                arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx])
            else:
                # raise NameError('Unknown parameter name.')
                pass
    else:
        for idx, arg_name in enumerate(arg_names):
            if 'weight' in arg_name:
                arg_params[arg_name] = mx.random.normal(0.0, sigma, shape=arg_shapes[idx])
            elif 'gamma' in arg_name:
                arg_params[arg_name] = mx.random.normal(1.0, sigma, shape=arg_shapes[idx])
            elif 'bias' in arg_name:
                arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx])
            elif 'beta' in arg_name:
                arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx])
            else:
                # raise NameError('Unknown parameter name.')
                pass
        for idx, aux_name in enumerate(aux_names):
            if 'mean' in aux_name:
                aux_params[aux_name] = mx.nd.zeros(shape=aux_shapes[idx])
            elif 'var' in aux_name:
                aux_params[aux_name] = mx.nd.ones(shape=aux_shapes[idx])
            else:
                raise NameError('Unknown aux_name.')

    discriminator.init_params(arg_params=arg_params, aux_params=aux_params)

    # gradient is scaled in LogisticRegression layer, no need to rescale gradient
    if lr_scheduler_d is not None:
        discriminator.init_optimizer(
            optimizer='adam',
            optimizer_params={
                'learning_rate': lr / 2.0,
                'lr_scheduler': lr_scheduler_d,
                'beta1': beta1,
                'rescale_grad': 1.0
            })
    else:
        discriminator.init_optimizer(
            optimizer='adam',
            optimizer_params={
                'learning_rate': lr / 2.0,
                'beta1': beta1,
                'rescale_grad': 1.0
            })
    mods.append(discriminator)

    #load the trained model
    import symbols.loss_layer.lsoftmax
    save_model_prefix = '/home/zhengxiawu/project/FGIR-GAN/trained_model/Resnet_lsoftmax'
    tag = 0
    trained_sym, trained_arg_params, trained_aux_params = \
        mx.model.load_checkpoint(save_model_prefix, tag)
    train_model = mx.mod.Module(symbol=trained_sym, data_names=('data',),
                                label_names=('label',), context=ctx)
    train_model.bind(data_shapes=[('data', (batch_size, 3, 256, 256))],
                     label_shapes=[('label', (batch_size,))],
                     inputs_need_grad=True)
    train_model.init_params(arg_params=trained_arg_params, aux_params=trained_aux_params)

    # metric
    mG = metric.CrossEntropyMetric()
    mD = metric.CrossEntropyMetric()
    mACC = metric.AccMetric()
    mL1 = metric.L1LossMetric(config)
    mTrained = mx_metric.create(['accuracy'])

    t_accumulate = 0

    # =============train===============
    for epoch in range(config.TRAIN.end_epoch):
        train_data.reset()
        mACC.reset()
        mG.reset()
        mD.reset()
        mL1.reset()
        mTrained.reset()
        for t, batch in enumerate(train_data):

            t_start = time.time()

            # generator input real A, output fake B
            generator.forward(batch, is_train=True)
            outG = generator.get_outputs()

            # feed the generated fake image into the pre-trained model
            train_model.forward(mx.io.DataBatch([outG[1] * (255.0 / 2.0)], batch.label), is_train=True)
            train_model.backward()
            diffT = train_model.get_input_grads()
            train_model.update_metric(mTrained, batch.label)
            generator.backward([mx.nd.array(np.ones((batch_size,)), ctx=ctx), diffT[0] * config.Trained_model_loss * (255.0/2.0)])
            generator.update()


            # update discriminator on fake
            # discriminator input real A and fake B
            # want discriminator to predict fake (0)
            label[:] = 0
            discriminator.forward(mx.io.DataBatch([batch.data[0], outG[1]], [label]), is_train=True)
            discriminator.backward()
            gradD = [[grad.copyto(grad.context) for grad in grads] for grads in discriminator._exec_group.grad_arrays]

            discriminator.update_metric(mD, [label])
            discriminator.update_metric(mACC, [label])

            # update discriminator on real
            # discriminator input real A and real B
            # want discriminator to predict real (1)
            label[:] = 1
            batch.label = [label]
            discriminator.forward(batch, is_train=True)
            discriminator.backward()
            for gradsr, gradsf in zip(discriminator._exec_group.grad_arrays, gradD):
                for gradr, gradf in zip(gradsr, gradsf):
                    # gradr =  (gradr + gradf)/2
                    gradr += gradf
            discriminator.update()

            discriminator.update_metric(mD, [label])
            discriminator.update_metric(mACC, [label])

            # update generator
            # discriminator input real A and fake B
            # want discriminator to predict real (1)
            label[:] = 1
            discriminator.forward(mx.io.DataBatch([batch.data[0], outG[1]], [label]), is_train=True)
            discriminator.backward()
            diffD = discriminator.get_input_grads()
            # loss does not need output gradient
            generator.backward([mx.nd.array(np.ones((batch_size,)), ctx=ctx), diffD[1] * config.GAN_loss])
            generator.update()

            mG.update([label], discriminator.get_outputs())
            mL1.update(None, outG)

            t_accumulate += time.time() - t_start

            t += 1
            if t % frequent == 0:
                if config.TRAIN.batch_end_plot_figure:
                    visualize(batch.data[0].asnumpy(), batch.data[1].asnumpy(), outG[1].asnumpy(), train_fig_prefix + '-train-%04d-%06d.png' % (epoch + 1, t))
                print('Epoch[{}] Batch[{}] Time[{:.4f}] dACC: {:.4f} gCE: {:.4f} dCE: {:.4f} gL1: {:.4f} tAcc: {:.4f}'.format(
                    epoch, t, t_accumulate, mACC.get()[1], mG.get()[1], mD.get()[1], mL1.get()[1], mTrained.get()[1][0]))
                logger.info('Epoch[{}] Batch[{}] Speed[{:.4f} batch/s] dACC: {:.4f} gCE: {:.4f} dCE: {:.4f} gL1: {:.4f}'.format(
                    epoch, t, frequent * batch_size / t_accumulate, mACC.get()[1], mG.get()[1], mD.get()[1], mL1.get()[1]))
                t_accumulate = 0

        if check_point:
            print('Saving...')
            if config.TRAIN.epoch_end_plot_figure:
                visualize(batch.data[0].asnumpy(), batch.data[1].asnumpy(), outG[1].asnumpy(),
                          train_fig_prefix + '-train-%04d.png' % (epoch + 1))
            if (epoch + 1) % config.TRAIN.save_interval == 0:
                generator.save_params(prefix + '-generator-%04d.params' % (epoch + 1))
                discriminator.save_params(prefix + '-discriminator-%04d.params' % (epoch + 1))

    generator.save_params(prefix + '-generator-%04d.params' % config.TRAIN.end_epoch)
    discriminator.save_params(prefix + '-discriminator-%04d.params' % config.TRAIN.end_epoch)
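
The discriminator update in this listing runs two forward/backward passes (the fake pair with label 0, then the real pair with label 1) and sums the gradients of both passes before taking a single optimizer step. Below is a minimal, self-contained sketch of that pattern; it assumes `discriminator` is a bound mx.mod.Module with two data inputs and one label, and it reuses the listing's access to the private `_exec_group.grad_arrays` attribute, so treat it as illustrative rather than public API.

import mxnet as mx

def update_discriminator_two_pass(discriminator, real_A, fake_B, real_B, label):
    # pass 1: (real_A, fake_B) should be classified as fake -> label 0
    label[:] = 0
    discriminator.forward(mx.io.DataBatch([real_A, fake_B], [label]), is_train=True)
    discriminator.backward()
    # snapshot the gradients of the fake pass before they are overwritten
    grads_fake = [[g.copyto(g.context) for g in grads]
                  for grads in discriminator._exec_group.grad_arrays]

    # pass 2: (real_A, real_B) should be classified as real -> label 1
    label[:] = 1
    discriminator.forward(mx.io.DataBatch([real_A, real_B], [label]), is_train=True)
    discriminator.backward()

    # accumulate: grad = grad_real + grad_fake, then take one optimizer step
    for grads_r, grads_f in zip(discriminator._exec_group.grad_arrays, grads_fake):
        for g_r, g_f in zip(grads_r, grads_f):
            g_r += g_f
    discriminator.update()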
Example #14
0
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            validate_metric=None,
            work_load_list=None,
            epoch_end_callback=None,
            batch_end_callback=None,
            fixed_param_prefix=None,
            initializer=None,
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            optimizer=None,
            optimizer_params=None,
            begin_epoch=0,
            num_epoch=None,
            kvstore='device'):  # note: a KVStore object (not just a string) is expected at runtime, since .rank is used below

        self.module.bind(data_shapes=self.data_shapes,
                         label_shapes=self.label_shapes,
                         for_training=True)
        self.module.init_params(initializer=initializer,
                                arg_params=arg_params,
                                aux_params=aux_params,
                                allow_missing=allow_missing)
        self.module.init_optimizer(kvstore=kvstore,
                                   optimizer=optimizer,
                                   optimizer_params=optimizer_params)

        if validate_metric is None:
            validate_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        temp_count = 0

        # # test model size by saving params of model
        # arg_params, aux_params = self.module.get_params()
        # for callback in _as_list(epoch_end_callback):
        #     callback(0, self.symbol, arg_params, aux_params)
        # raise NotImplementedError

        # training loop
        for epoch in range(begin_epoch, num_epoch):

            train_time = AverageMeter()
            kvstore_sync_time = AverageMeter()
            get_data_time = AverageMeter()
            iter_total_time = AverageMeter()

            tic = time.time()
            eval_metric.reset()
            nbatch = 0
            data_iter = iter(train_data)
            end_of_batch = False
            next_data_batch = next(data_iter)
            while not end_of_batch:
                start_time = time.time()
                data_batch = next_data_batch

                self.module.forward(data_batch, is_train=True)
                self.module.backward()

                # ndarray.waitall()
                train_time.update(time.time() - start_time)

                self.module.update()

                # ndarray.waitall()
                kvstore_sync_time.update(time.time() - start_time)

                try:
                    next_data_batch = next(data_iter)
                except StopIteration:
                    end_of_batch = True

                # ndarray.waitall()
                get_data_time.update(time.time() - start_time)

                if isinstance(data_batch, list):
                    self.module.update_metric(eval_metric,
                                              [db.label for db in data_batch],
                                              pre_sliced=True)
                else:
                    self.module.update_metric(eval_metric, data_batch.label)

                # ndarray.waitall()
                iter_total_time.update(time.time() - start_time)

                if batch_end_callback is not None:
                    # batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                    #                                  eval_metric=eval_metric,
                    #                                  locals=locals())

                    batch_end_params = BatchEndParam(
                        epoch=epoch,
                        nbatch=nbatch,
                        eval_metric=eval_metric,
                        locals=locals(),
                        rank=kvstore.rank,
                        total_iter=temp_count,
                        cur_data_time=get_data_time.val,
                        avg_data_time=get_data_time.avg,
                        cur_batch_time=train_time.val,
                        avg_batch_time=train_time.avg,
                        cur_kvstore_sync_time=kvstore_sync_time.val,
                        avg_kvstore_sync_time=kvstore_sync_time.avg,
                        cur_iter_total_time=iter_total_time.val,
                        avg_iter_total_time=iter_total_time.avg)
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                nbatch += 1
                temp_count += 1

            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

            arg_params, aux_params = self.module.get_params()
            self.module.set_params(arg_params, aux_params)

            if epoch_end_callback is not None and kvstore.rank == 0:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)
            if eval_data:
                if self.config.network == 'mobilenet_int8_foldbn':
                    # for fold bn to create inference symbol
                    total_params_path = "./model/%s-%04d.params" % (
                        self.config.model_prefix, epoch + 1)
                    # total_params_path = "./model/mobilenet_flodbn_0904/mobilenet_int8_flodbn_imagenet_retrain_80_pertensor-fold-0100.params"
                    # _, arg_params, aux_params = mx.model.load_checkpoint('./model/mobilenet_flodbn_0904/mobilenet_int8_flodbn_imagenet_retrain_80_pertensor-fold', 100)
                    import os
                    assert os.path.exists(
                        total_params_path
                    ), "please provide the correct total_params_path for foldbn eval"
                    eval_sym = eval(self.config.network)(
                        num_classes=self.config.num_classes,
                        quant_mod=self.config.quant_mod,
                        delay_quant=self.config.delay_quant,
                        is_weight_perchannel=self.config.is_weight_perchannel,
                        total_params_path=total_params_path,
                        quantize_flag=self.config.quantize_flag)
                    eval_module = Module(
                        symbol=eval_sym,
                        data_names=self.data_names,
                        label_names=self.label_names,
                        logger=self.logger,
                        context=self.context,
                        work_load_list=self.work_load_list,
                        fixed_param_names=self.fixed_param_names)
                    eval_module.bind(data_shapes=self.data_shapes,
                                     label_shapes=self.label_shapes,
                                     for_training=False)
                    eval_module.init_params(initializer=initializer,
                                            arg_params=arg_params,
                                            aux_params=aux_params)
                    res = eval_module.score(eval_data,
                                            validate_metric,
                                            score_end_callback=None,
                                            batch_end_callback=None,
                                            reset=True,
                                            epoch=epoch)
                else:
                    res = self.module.score(eval_data,
                                            validate_metric,
                                            score_end_callback=None,
                                            batch_end_callback=None,
                                            reset=True,
                                            epoch=epoch)
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                     val)

            train_data.reset()
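
Example #14 reads `.val` and `.avg` from an `AverageMeter` that is not shown in the listing. A minimal sketch consistent with that usage (last value plus running average) might look like the following; the real helper in the source project may differ.

class AverageMeter(object):
    """Tracks the most recent value and the running average of a series."""

    def __init__(self):
        self.val = 0.0    # last recorded value
        self.sum = 0.0    # sum of all recorded values
        self.count = 0    # number of recordings
        self.avg = 0.0    # running average

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count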
Example #15
0
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(('learning_rate', 0.01), ),
            eval_end_callback=None,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            validation_metric=None,
            monitor=None):

        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label,
                  for_training=True,
                  force_rebind=force_rebind)
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer,
                         arg_params=arg_params,
                         aux_params=aux_params,
                         allow_missing=allow_missing,
                         force_init=force_init)
        self.init_optimizer(kvstore=kvstore,
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        ################################################################################
        # training loop
        ################################################################################

        last_grad_debug = None

        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            nbatch = 0
            data_iter = iter(train_data)
            end_of_batch = False
            next_data_batch = next(data_iter)
            while not end_of_batch:
                data_batch = next_data_batch
                if monitor is not None:
                    monitor.tic()
                self.forward_backward(data_batch)

                # grad_array = [[grad.copyto(grad.context) if grad is not None else None for grad in grads] for grads in
                #             self._curr_module._exec_group.grad_arrays]
                #
                # for exec_ in self._curr_module._exec_group.execs:
                #     grad_dict = exec_.grad_dict
                #
                # grad_debug = dict()
                # for k, v in grad_dict.items():
                #     if v is not None:
                #         v_np = v.asnumpy()
                #         grad_debug[k] = (np.min(v_np), np.max(v_np))
                # print 'rpn_conv_cls_weight:', grad_debug['rpn_conv_cls_weight']
                # print 'rcnn_fc_cls_weight:', grad_debug['rcnn_fc_cls_weight']

                self.update()
                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch)
                except StopIteration:
                    end_of_batch = True

                self.update_metric(eval_metric, data_batch.label)

                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                nbatch += 1

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            #----------------------------------------
            # evaluation on validation set
            if eval_data:
                res = self.score(eval_data,
                                 validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback,
                                 epoch=epoch)
                # TODO: pull this into default
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                     val)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
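
Both training loops above use the same prefetch idiom: the next batch is requested (and, in Example #15, handed to `self.prepare`) while the current one is still being processed, and `StopIteration` marks the end of the epoch. A stripped-down sketch of the idiom follows; the names `run_epoch` and `process_batch` are illustrative, not from the source.

def run_epoch(train_iter, process_batch):
    data_iter = iter(train_iter)
    end_of_batch = False
    next_batch = next(data_iter)          # prime the pipeline with the first batch
    while not end_of_batch:
        batch = next_batch
        process_batch(batch)              # forward/backward/update on the current batch
        try:
            next_batch = next(data_iter)  # request the next batch early so data
        except StopIteration:             # loading overlaps with computation
            end_of_batch = True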
Example #16
0
    def fit(self, X, y=None, eval_data=None, eval_metric='acc',
            epoch_end_callback=None, batch_end_callback=None, kvstore='local', logger=None,
            work_load_list=None, monitor=None, eval_batch_end_callback=None):
        """Fit the model.

        Parameters
        ----------
        X : DataIter, or numpy.ndarray/NDArray
            Training data. If X is a DataIter, the name or, if not available,
            position, of its outputs should match the corresponding variable
            names defined in the symbolic graph.
        y : numpy.ndarray/NDArray, optional
            Training set label.
            If X is numpy.ndarray/NDArray, y is required to be set.
            While y can be 1D or 2D (with 2nd dimension as 1), its 1st dimension must be
            the same as X, i.e. the number of data points and labels should be equal.
        eval_data : DataIter or numpy.ndarray/list/NDArray pair
            If eval_data is numpy.ndarray/list/NDArray pair,
            it should be (valid_data, valid_label).
        eval_metric : metric.EvalMetric or str or callable
            The evaluation metric, name of evaluation metric.
            Or a customize evaluation function that returns the statistics
            based on minibatch.
        epoch_end_callback : callable(epoch, symbol, arg_params, aux_states)
            A callback that is invoked at end of each epoch.
            This can be used to checkpoint model each epoch.
        batch_end_callback: callable(epoch)
            A callback that is invoked at the end of each batch,
            typically for printing progress.
        kvstore: KVStore or str, optional
           The KVStore or a string kvstore type: 'local', 'dist_sync', 'dist_async'.
           Defaults to 'local'; usually no need to change this on a single machine.
        logger : logging logger, optional
            When not specified, default logger will be used.
        work_load_list : float or int, optional
            The list of work load for different devices,
            in the same order as ctx

        Note
        ----
        KVStore behavior
        - 'local', multiple devices on a single machine; automatically chooses the best type.
        - 'dist_sync', multiple machines with BSP (bulk synchronous parallel) updates.
        - 'dist_async', multiple machines with partially asynchronous updates.
        """

        data = self._init_iter(X, y, is_train=True)
        eval_data = self._init_eval_iter(eval_data)

        if self.sym_gen:
            self.symbol = self.sym_gen(data.default_bucket_key) # pylint: disable=no-member
            self._check_arguments()
        self.kwargs["sym"] = self.symbol

        arg_names, param_names, aux_names = \
                self._init_params(dict(data.provide_data+data.provide_label))
        param_idx2name = {}
        for i, n in enumerate(param_names):
            for k in range(len(self.ctx)):
                param_idx2name[i*len(self.ctx)+k] = n
        self.kwargs["param_idx2name"] = param_idx2name

        # setup metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        # create kvstore
        (kvstore, update_on_kvstore) = _create_kvstore(
            kvstore, len(self.ctx), self.arg_params)

        # initialize the optimizer
        if isinstance(self.optimizer, str):
            batch_size = data.batch_size
            if kvstore and kvstore.type == 'dist_sync':
                batch_size *= kvstore.num_workers
            optimizer = opt.create(self.optimizer,
                                   rescale_grad=(1.0/batch_size),
                                   **(self.kwargs))
        elif isinstance(self.optimizer, opt.Optimizer):
            optimizer = self.optimizer

        # do training
        _train_multi_device(self.symbol, self.ctx, arg_names, param_names, aux_names,
                            self.arg_params, self.aux_params,
                            begin_epoch=self.begin_epoch, end_epoch=self.num_epoch,
                            epoch_size=self.epoch_size,
                            optimizer=optimizer,
                            train_data=data, eval_data=eval_data,
                            eval_metric=eval_metric,
                            epoch_end_callback=epoch_end_callback,
                            batch_end_callback=batch_end_callback,
                            kvstore=kvstore, update_on_kvstore=update_on_kvstore,
                            logger=logger, work_load_list=work_load_list, monitor=monitor,
                            eval_batch_end_callback=eval_batch_end_callback,
                            sym_gen=self.sym_gen)
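
The optimizer setup above encodes a common rule: `rescale_grad` divides the accumulated gradient by the effective global batch size, which under 'dist_sync' is the local batch size multiplied by the number of workers. A hedged sketch of just that rule (the surrounding `_create_kvstore`/`_train_multi_device` machinery is project-internal and not reproduced here):

import mxnet.optimizer as opt

def make_optimizer(name, batch_size, kvstore=None, **kwargs):
    # under synchronous distributed training every worker contributes a batch,
    # so the effective global batch grows with the number of workers
    if kvstore is not None and kvstore.type == 'dist_sync':
        batch_size *= kvstore.num_workers
    return opt.create(name, rescale_grad=1.0 / batch_size, **kwargs)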
Example #17
0
    def train_model(self, train_iter, batch_size: int, epochs: int,
                    num_features, optimizer: str, learning_rate: float,
                    momentum: float, score_after: float, eval_metric: str):
        """
        Parameters
        ----------
        eval_metric - "accuracy", "ce" (CrossEntropy), "f1", "mae", "mse",
                      "rmse", "top_k_accuracy".
        """
        # Set Monitor
        monitor = mon.Monitor(interval=score_after,
                              pattern=".*",
                              stat_func=self.loss_fn)
        self.train_module.install_monitor(monitor)

        # Create Metric
        eval_metric_fn = metric.create(eval_metric)
        self.train_module.bind(data_shapes=io.DataDesc(name="data",
                                                       shape=(batch_size,
                                                              num_features)),
                               label_shapes=io.DataDesc(name="target",
                                                        shape=(batch_size, 1)),
                               for_training=True)
        self.train_module.init_optimizer(optimizer=optimizer,
                                         optimizer_params={
                                             "learning_rate": learning_rate,
                                             "momentum": momentum
                                         })
        self.train_module.init_params()

        for epoch in range(epochs):
            eval_metric_fn.reset()
            end_of_batch = False
            tic = time()
            nbatch = 0
            while not end_of_batch:
                try:
                    train_batch = train_iter.next()
                    monitor.tic()
                    self.train_module.forward_backward(train_batch)
                    self.train_module.update()
                    self.train_module.update_metric(eval_metric_fn,
                                                    train_batch.label)

                    for name, value in eval_metric_fn.get():
                        self.writer.add_scalar(tag="Train " + name,
                                               value=value,
                                               global_step=(epoch + 1) *
                                               nbatch)
                        print("Epoch[%d] Batch[%d] Train-%s=%.3f" %
                              (epoch, nbatch, name, value))
                except StopIteration:
                    end_of_batch = True
                nbatch += 1

            print("Epoch[%d] completed! Time cost=%.3f s" % (epoch,
                                                             (time() - tic)))

            for grad in self.train_module.get_input_grads():
                self.writer.add_histogram(values=grad,
                                          bins=1000,
                                          global_step=epoch)
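
The docstring above lists the metric names accepted by `metric.create`. A short usage sketch with illustrative values:

import mxnet as mx
from mxnet import metric

acc = metric.create('acc')                        # classification accuracy
top5 = metric.create('top_k_accuracy', top_k=5)   # TopKAccuracy requires top_k
both = metric.create(['ce', 'acc'])               # a list yields a CompositeEvalMetric

labels = [mx.nd.array([1, 0])]
preds = [mx.nd.array([[0.2, 0.8], [0.9, 0.1]])]
acc.update(labels, preds)
print(acc.get())  # ('accuracy', 1.0)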
Example #18
0
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            period=['train', 'val'],
            to_eval_train=True,
            grad_req='write',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            logger=None):

        if logger is None:
            logger = logging
        logging.info('Start training with %s', str(self.ctx))
        # region 1. Prepare the parameters, including input data and label data
        # parameter names of the FCN
        arg_names = self.symbol.list_arguments()
        # parameter shapes of the FCN
        # print train_data.provide_data[0]
        arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
            data=train_data.provide_data[0][1])
        # arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data=(1, 3,
        #                                                                    train_data.resize_size[0],
        #                                                                    train_data.resize_size[1],
        #                                                                    ))
        # print train_data.provide_data[0][1]
        # quit()
        # input data and label data
        data_name = train_data.provide_data[0][0]
        label_name = train_data.provide_label[0][0]
        # print data_name, label_name
        # input_names = [data_name, label_name]
        # batch_size, channel, h, w
        # data_shape = train_data.provide_data[0][1]
        self.arg_params[data_name] = mx.nd.empty(train_data.provide_data[0][1],
                                                 self.ctx)
        # # batch_size, h*w
        self.arg_params[label_name] = mx.nd.empty(
            train_data.provide_label[0][1], self.ctx)
        # quit()
        # auxiliary parameters
        aux_names = self.symbol.list_auxiliary_states()
        self.aux_params = {
            k: mx.nd.zeros(s)
            for k, s in zip(aux_names, aux_shapes)
        }
        # endregion

        # region 2. Prepare the parameter gradients
        if grad_req != 'null':
            self.grad_params = {}
            for name, shape in zip(arg_names, arg_shapes):
                if not (name.endswith('data') or name.endswith('label')):
                    # print name,shape
                    self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
        else:
            self.grad_params = None
        # endregion
        # print self.arg_params
        # region 3. Bind the model parameters and the model outputs
        self.executor = self.symbol.bind(self.ctx,
                                         self.arg_params,
                                         args_grad=self.grad_params,
                                         grad_req=grad_req,
                                         aux_states=self.aux_params)
        # quit()
        assert len(self.symbol.list_arguments()) == len(
            self.executor.grad_arrays)
        # bind the output variables
        output_dict = {}
        output_buff = {}
        for key, arr in zip(self.symbol.list_outputs(), self.executor.outputs):
            # print key, arr
            output_dict[key] = arr
            output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
        # endregion

        # region 4. Set up the optimizer
        self.optimizer = opt.create(self.optimizer,
                                    rescale_grad=1.0 / train_data.batch_size,
                                    **self.kwargs)
        self.updater = get_updater(self.optimizer)
        # parameters whose gradients will be updated
        update_dict = {
            name: nd
            for name, nd in zip(self.symbol.list_arguments(),
                                self.executor.grad_arrays) if nd is not None
        }
        # endregion

        # region 5. Set up the evaluation metric
        if eval_metric == 'acc':
            eval_metric = metric.create(eval_metric)
        elif eval_metric == 'meanIOU':
            eval_metric = MeanIoU(c=1)
        # endregion

        for epoch in range(self.begin_epoch, self.num_epoch):
            # region begin training
            if 'train' in period:
                logger.info(" in train process...")
                all_start = time.time()
                nbatch = 0
                train_data.reset()
                eval_metric.reset()
                for data in train_data:
                    nbatch += 1
                    # all_start = time.time()
                    # region 1. Prepare the batch data
                    # start = time.time()
                    self.arg_params[data_name][:] = data.data[0]
                    # end = time.time()
                    # print end-start
                    # label_shape = data.label[0].shape
                    # print label_shape
                    self.arg_params[label_name][:] = data.label[0]
                    # end = time.time()
                    # print 'prepare data and label time: %s s' % (end - start)
                    # quit()
                    # print self.arg_params[label_name][:]
                    # endregion

                    # region 2. forward
                    # start = time.time()
                    self.executor.forward(is_train=True)
                    # end = time.time()
                    # print 'forward time: %s s' % (end - start)

                    # endregion

                    # region 3. backward
                    # start = time.time()
                    self.executor.backward()
                    for key, arr in update_dict.items():
                        if key != "bigscore_weight":
                            # parameter name, gradient, weight
                            self.updater(key, arr, self.arg_params[key])
                            # self.executor.outputs[0].wait_to_read()
                    # end = time.time()
                    # print 'backward time: %f s' % (end - start)
                    # endregion

                    # region 4. Evaluate
                    # start = time.time()
                    if to_eval_train:
                        # start = time.time()
                        # fetch the outputs
                        for key in output_dict:
                            # print key
                            output_dict[key].copyto(output_buff[key])
                            # output_dict[key].wait_to_read()
                        # end = time.time()
                        # print 'output1 copy time: %s s' % (end - start)
                        # start = time.time()
                        pred_shape = output_buff['softmax_output'].shape
                        # print pred_shape, label_shape
                        # label = self.arg_params[label_name]
                        pred = output_buff['softmax_output'].reshape(
                            (pred_shape[0], pred_shape[1],
                             pred_shape[2] * pred_shape[3]))
                        # pred = pred.copyto(self.ctx)
                        # print pred.shape
                        label = data.label[0]
                        # quit()
                        # end = time.time()
                        # print 'output copy2 time: %s s' % (end - start)
                        # update the metric
                        eval_metric.update([label], [pred])
                    batch_end_params = BatchEndParam(
                        epoch=epoch,
                        nbatch=nbatch,
                        eval_metric=eval_metric if to_eval_train else None,
                    )
                    batch_end_callback(batch_end_params)
                    # end = time.time()
                    # print 'eval time: %s s' % (end - start)
                    # endregion
                    # all_end = time.time()
                    # print 'all time: %s s' % (all_end - all_start)
                    # if nbatch > 1:
                    #     quit()
                if epoch_end_callback is not None:
                    epoch_end_callback(epoch, self.symbol, self.arg_params,
                                       self.aux_params)

                # all_end = time.time()
                # print 'all time1: %s s' % (all_end - all_start)
                if to_eval_train:
                    name, value = eval_metric.get()
                    logger.info(
                        "                     --->Epoch[%d] Train-%s=%f",
                        epoch, name, value)
                logger.info('train time per epoch: %f s' %
                            (time.time() - all_start))
            # endregion
            # evaluation
            if 'val' in period and eval_data:
                logger.info(" in eval process...")
                nbatch = 0
                eval_data.reset()
                eval_metric.reset()
                # all_start = time.time()
                for data in eval_data:
                    nbatch += 1
                    # label_shape = data.label.shape

                    self.arg_params[data_name][:] = data.data[0]
                    self.arg_params[label_name][:] = data.label[0]

                    self.executor.forward(is_train=False)
                    pred_shape = self.executor.outputs[0].shape

                    cpu_output_array = mx.nd.empty(pred_shape)
                    self.executor.outputs[0].copyto(cpu_output_array)

                    label = data.label[0]

                    pred = cpu_output_array.reshape(
                        (pred_shape[0], pred_shape[1],
                         pred_shape[2] * pred_shape[3]))

                    eval_metric.update([label], [pred])

                    batch_end_params = BatchEndParam(
                        epoch=epoch,
                        nbatch=nbatch,
                        eval_metric=None,
                    )
                    batch_end_callback(batch_end_params)

                    # if nbatch>200:
                    #     quit()
                    # quit()
                    # self.executor.outputs[0].wait_to_read()
                # all_end = time.time()
                # print 'all time1: %s s' % (all_end - all_start)
                # all_start = time.time()
                name, value = eval_metric.get()
                logger.info('Epoch[%d] Validation-%s=%f', epoch, name, value)
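
`MeanIoU` in Example #18 is a project-specific metric that is not shown in the listing. For orientation, here is a minimal custom EvalMetric in the same spirit, computing a binary mean intersection-over-union; the shapes follow the listing's (batch, num_classes, h*w) predictions and (batch, h*w) labels, and the foreground-class handling is an assumption, not the project's exact implementation.

import numpy as np
from mxnet import metric

class BinaryMeanIoU(metric.EvalMetric):
    """Mean IoU of the foreground class (class index 1), averaged over updates."""

    def __init__(self, name='meanIOU'):
        super(BinaryMeanIoU, self).__init__(name)

    def update(self, labels, preds):
        for label, pred in zip(labels, preds):
            pred_cls = pred.asnumpy().argmax(axis=1)      # (batch, h*w) class map
            label_np = label.asnumpy().astype(np.int64)   # (batch, h*w) ground truth
            inter = np.logical_and(pred_cls == 1, label_np == 1).sum()
            union = np.logical_or(pred_cls == 1, label_np == 1).sum()
            if union > 0:
                self.sum_metric += float(inter) / float(union)
                self.num_inst += 1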
Example #19
0
def update_network(queue, nn_update_idx, symbol_filename, params_filename,
                   convert_to_onnx, main_config, train_config: TrainConfig,
                   model_contender_dir):
    """
    Updates the neural network with the newly acquired games from the replay memory and creates a new NN checkpoint
    in the model contender directory after training.
    :param queue: Queue object used to return items
    :param nn_update_idx: Defines how many updates of the nn have already been done. This index should be incremented
    after every update.
    :param symbol_filename: Architecture definition file
    :param params_filename: Weight file which will be loaded before training
    :param convert_to_onnx: Boolean indicating if the network shall be exported to ONNX to allow TensorRT inference
    :param main_config: Dict of the main config (imported from main_config.py)
    :param train_config: TrainConfig object (imported from train_config.py)
    :param model_contender_dir: String of the contender directory path
    :return: k_steps_final
    """

    # set the context on CPU, switch to GPU if there is one available (strongly recommended for training)
    ctx = mx.gpu(
        train_config.device_id) if train_config.context == "gpu" else mx.cpu()
    # count how many training archives are available
    train_config.nb_parts = len(
        glob.glob(main_config["planes_train_dir"] + '**/*.zip'))
    logging.info("number of parts for training: %d" % train_config.nb_parts)
    train_objects = TrainObjects()

    if train_config.nb_parts <= 0:
        raise Exception(
            'No .zip files for training available. Check the path in main_config["planes_train_dir"]:'
            ' %s' % main_config["planes_train_dir"])

    _, x_val, y_val_value, y_val_policy, _, _ = load_pgn_dataset(
        dataset_type="val",
        part_id=0,
        normalize=train_config.normalize,
        verbose=False,
        q_value_ratio=train_config.q_value_ratio)
    y_val_policy = prepare_policy(y_val_policy,
                                  train_config.select_policy_from_plane,
                                  train_config.sparse_policy_label,
                                  train_config.is_policy_from_plane_data)
    val_dataset = gluon.data.ArrayDataset(nd.array(x_val),
                                          nd.array(y_val_value),
                                          nd.array(y_val_policy))
    val_data = gluon.data.DataLoader(val_dataset,
                                     train_config.batch_size,
                                     shuffle=False,
                                     num_workers=train_config.cpu_count)

    symbol = mx.sym.load(symbol_filename)

    # calculate how many iterations per epoch exist
    nb_it_per_epoch = (len(x_val) *
                       train_config.nb_parts) // train_config.batch_size
    # one iteration is defined by passing 1 batch and doing backprop
    train_config.total_it = int(nb_it_per_epoch *
                                train_config.nb_training_epochs)

    train_objects.lr_schedule = CosineAnnealingSchedule(
        train_config.min_lr, train_config.max_lr,
        max(train_config.total_it * .7, 1))
    train_objects.lr_schedule = LinearWarmUp(train_objects.lr_schedule,
                                             start_lr=train_config.min_lr,
                                             length=max(
                                                 train_config.total_it * .25,
                                                 1))
    train_objects.momentum_schedule = MomentumSchedule(
        train_objects.lr_schedule, train_config.min_lr, train_config.max_lr,
        train_config.min_momentum, train_config.max_momentum)

    input_shape = x_val[0].shape
    inputs = mx.sym.var('data', dtype='float32')
    value_out = symbol.get_internals()[main_config['value_output'] + '_output']
    policy_out = symbol.get_internals()[main_config['policy_output'] +
                                        '_output']
    sym = mx.symbol.Group([value_out, policy_out])
    net = mx.gluon.SymbolBlock(sym, inputs)
    net.collect_params().load(params_filename, ctx)

    metrics_gluon = {
        'value_loss':
        metric.MSE(name='value_loss', output_names=['value_output']),
        'value_acc_sign':
        metric.create(acc_sign,
                      name='value_acc_sign',
                      output_names=['value_output'],
                      label_names=['value_label']),
    }

    if train_config.sparse_policy_label:
        print("train with sparse labels")
        # the default cross entropy only supports sparse labels
        metrics_gluon['policy_loss'] = metric.CrossEntropy(
            name='policy_loss',
            output_names=['policy_output'],
            label_names=['policy_label'])
        metrics_gluon['policy_acc'] = metric.Accuracy(
            axis=1,
            name='policy_acc',
            output_names=['policy_output'],
            label_names=['policy_label'])
    else:
        metrics_gluon['policy_loss'] = metric.create(
            cross_entropy,
            name='policy_loss',
            output_names=['policy_output'],
            label_names=['policy_label'])
        metrics_gluon['policy_acc'] = metric.create(
            acc_distribution,
            name='policy_acc',
            output_names=['policy_output'],
            label_names=['policy_label'])

    train_objects.metrics = metrics_gluon

    train_config.export_weights = False  # don't save intermediate weights
    train_agent = TrainerAgent(net,
                               val_data,
                               train_config,
                               train_objects,
                               use_rtpt=False)

    # iteration counter used for the momentum and learning rate schedule
    cur_it = train_config.k_steps_initial * train_config.batch_steps
    (k_steps_final, val_value_loss_final, val_policy_loss_final,
     val_value_acc_sign_final,
     val_policy_acc_final), _ = train_agent.train(cur_it)

    prefix = "%smodel-%.5f-%.5f-%.3f-%.3f" % (
        model_contender_dir, val_value_loss_final, val_policy_loss_final,
        val_value_acc_sign_final, val_policy_acc_final)

    sym_file = prefix + "-symbol.json"
    params_file = prefix + "-" + "%04d.params" % nn_update_idx

    # the export function saves both the architecture and the weights
    net.export(prefix, epoch=nn_update_idx)
    print()
    logging.info("Saved checkpoint to %s-%04d.params", prefix, nn_update_idx)

    if convert_to_onnx:
        convert_mxnet_model_to_onnx(sym_file, params_file,
                                    ["value_out_output", "policy_out_output"],
                                    input_shape, [1, 8, 16], False)

    logging.info("k_steps_final %d" % k_steps_final)
    queue.put(k_steps_final)
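
The learning-rate schedule in Example #19 wraps a cosine-annealing schedule (spanning 70% of the total iterations) in a linear warm-up (25% of the total iterations). The project's CosineAnnealingSchedule and LinearWarmUp classes are not shown here; the following standalone sketch only approximates the composed shape and is an assumption, not the project's exact implementation.

import math

def warmup_cosine_lr(it, total_it, min_lr, max_lr,
                     warmup_frac=0.25, cosine_frac=0.7):
    warmup_it = max(total_it * warmup_frac, 1)
    cosine_it = max(total_it * cosine_frac, 1)
    if it < warmup_it:
        # linear ramp from min_lr up to max_lr over the warm-up phase
        return min_lr + (max_lr - min_lr) * it / warmup_it
    # cosine decay from max_lr back down to min_lr afterwards
    t = min((it - warmup_it) / cosine_it, 1.0)
    return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * t))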
Example #20
0
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            grad_req='write',
            logger=None,
            softmax_metric=None,
            regression_metric=None,
            epoch_end_callback=None):

        f = open("log_rpn.txt", 'w')
        if logger is None:
            logger = logging
        logging.info('Start training with %s', str(self.ctx))
        f.write('Start training with %s\n' % str(self.ctx))
        arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
            data=(1, 3, 128, 128),
            mean_face=(10, 3),
            ground_truth=(10, 2),
            bbox_label=(10, 5))
        arg_names = self.symbol.list_arguments()
        if grad_req != 'null':
            self.grad_params = {}
            for name, shape in zip(arg_names, arg_shapes):
                if not (name.endswith('data') or name.endswith("mean_face")
                        or name.endswith('cls_label')
                        or name.endswith('proj_weight')
                        or name.endswith('proj_label')
                        or name.endswith('ground_truth')
                        or name.endswith('bbox_label')
                        or name.endswith("bbox_weight")):
                    self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
        else:
            self.grad_params = None

        aux_names = self.symbol.list_auxiliary_states()
        self.aux_params = {
            k: mx.nd.zeros(s, self.ctx)
            for k, s in zip(aux_names, aux_shapes)
        }

        data_name = train_data.data_name
        cls_label_name = train_data.cls_label_name
        proj_label_name = train_data.proj_label_name
        proj_weight_name = train_data.proj_weight_name
        ground_truth_name = train_data.ground_truth_name
        bbox_label_name = train_data.bbox_label_name
        bbox_weight_name = train_data.bbox_weight_name

        self.optimizer = opt.create(self.optimizer,
                                    rescale_grad=1.0,
                                    **(self.kwargs))
        self.updater = get_updater(self.optimizer)
        eval_metric = metric.create(eval_metric)

        for epoch in range(self.begin_epoch, self.num_epoch):
            if eval_data:
                logger.info(" in eval process...")
                f.write(" in eval process...")
                nbatch = 0
                softmax_proj = np.zeros((11, 3))
                proj_regression_loss = .0
                bbox_predict_loss = np.array([.0, .0])
                eval_data.reset()
                for data in eval_data:
                    nbatch += 1
                    print "Eval batch:", nbatch
                    softmax_shape = data[cls_label_name].shape
                    self.arg_params[data_name] = mx.nd.array(
                        data[data_name], self.ctx)
                    self.arg_params[cls_label_name] = mx.nd.array(
                        data[cls_label_name].reshape(
                            (softmax_shape[0],
                             softmax_shape[1] * softmax_shape[2])), self.ctx)
                    self.arg_params[proj_label_name] = mx.nd.array(
                        data[proj_label_name], self.ctx)
                    self.arg_params[proj_weight_name] = mx.nd.array(
                        data[proj_weight_name], self.ctx)
                    self.arg_params[ground_truth_name] = mx.nd.array(
                        data[ground_truth_name], self.ctx)
                    self.arg_params[bbox_label_name] = mx.nd.array(
                        data[bbox_label_name], self.ctx)
                    self.arg_params[bbox_weight_name] = mx.nd.array(
                        data[bbox_weight_name], self.ctx)
                    self.arg_params["mean_face"] = mx.nd.array(
                        train_data.mean_face, self.ctx)

                    executor = self.symbol.bind(self.ctx,
                                                self.arg_params,
                                                args_grad=self.grad_params,
                                                grad_req=grad_req,
                                                aux_states=self.aux_params)

                    softmax_output_array = mx.nd.zeros(
                        executor.outputs[0].shape)
                    proj_regression_output_array = mx.nd.zeros(
                        executor.outputs[1].shape)
                    bbox_predict_output_array = mx.nd.zeros(
                        executor.outputs[2].shape)
                    ell_label = mx.nd.zeros(executor.outputs[3].shape)
                    bbox_predict = mx.nd.zeros(executor.outputs[4].shape)
                    executor.forward(is_train=True)
                    executor.outputs[0].copyto(softmax_output_array)
                    executor.outputs[1].copyto(proj_regression_output_array)
                    executor.outputs[2].copyto(bbox_predict_output_array)
                    executor.outputs[3].copyto(ell_label)
                    executor.outputs[4].copyto(bbox_predict)

                    softmax_shape = softmax_output_array.shape
                    index_label = np.nonzero(data[cls_label_name].reshape(
                        softmax_shape[0], softmax_shape[2] *
                        softmax_shape[3]) - 255)
                    label = mx.nd.array(data[cls_label_name].reshape(
                        softmax_shape[0],
                        softmax_shape[2] * softmax_shape[3])[:,
                                                             index_label[1]])
                    pred = mx.nd.array((softmax_output_array.asnumpy().reshape(
                        softmax_shape[0], softmax_shape[1],
                        softmax_shape[2] * softmax_shape[3]))[...,
                                                              index_label[1]])
                    if softmax_metric:
                        tempt = softmax_metric(label, pred, 11)
                        softmax_proj += tempt

                    proj_label = data[proj_label_name]
                    proj_weight = data[proj_weight_name]
                    proj_pred = proj_regression_output_array.asnumpy().reshape(
                        data[proj_weight_name].shape)
                    index_nonzero = np.nonzero(data[proj_weight_name])
                    proj_regress_tmp = regression_metric(
                        proj_label[index_nonzero], proj_pred[index_nonzero],
                        proj_weight[index_nonzero])
                    proj_regression_loss += proj_regress_tmp

                    bbox_pred = bbox_predict_output_array.asnumpy()
                    bbox_predict_tmp = bbox_predict_metric(
                        ell_label.asnumpy(), bbox_pred)
                    bbox_predict_loss += bbox_predict_tmp

                    print "Validation-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f" % \
                          (epoch, nbatch, get_accuracy(tempt, self.bgfg), proj_regress_tmp,
                           bbox_predict_tmp[0], bbox_predict_tmp[1])
                    f.write(
                        "Validation-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f\n"
                        % (epoch, nbatch, get_accuracy(
                            tempt, self.bgfg), proj_regress_tmp,
                           bbox_predict_tmp[0], bbox_predict_tmp[1]))

                    img_info = eval_data.AllImg[nbatch - 1]
                    print "%s\twidth: %d height: %d num_face: %d" % \
                          (img_info.filename, img_info.width, img_info.height, img_info.num_faces)
                    f.write("%s\twidth: %d height: %d num_face: %d\n" %
                            (img_info.filename, img_info.width,
                             img_info.height, img_info.num_faces))

                    executor.outputs[0].wait_to_read()
                    executor.outputs[1].wait_to_read()
                    executor.outputs[2].wait_to_read()
                    executor.outputs[3].wait_to_read()

                print_accuracy(softmax_proj, f, train_data.class_names,
                               self.bgfg)
                logger.info("ALL Validation accuracy: %f",
                            get_accuracy(softmax_proj, self.bgfg))
                logger.info('Validation projection regression: %f',
                            proj_regression_loss / nbatch)
                logger.info('Validation bbox predict: %f %f',
                            bbox_predict_loss[0] / nbatch,
                            bbox_predict_loss[1] / nbatch)
                f.write("ALL Validation accuracy: %f\n" %
                        get_accuracy(softmax_proj, self.bgfg))
                f.write("Validation projection regression: %f\n" %
                        (proj_regression_loss / nbatch))
                f.write("Validation bbox predict: %f %f\n" %
                        (bbox_predict_loss[0] / nbatch,
                         bbox_predict_loss[1] / nbatch))

            nbatch = 0
            train_data.reset()
            eval_metric.reset()
            proj_regress_loss_t = .0
            proj_regress_loss_b = .0
            softmax_count = np.zeros((11, 3))
            softmax_batch = np.zeros((11, 3))
            bbox_predict_loss_t = np.array([.0, .0])
            bbox_predict_loss_b = np.array([.0, .0])
            for data in train_data:
                nbatch += 1
                softmax_shape = data[cls_label_name].shape
                self.arg_params[data_name] = mx.nd.array(
                    data[data_name], self.ctx)
                self.arg_params[cls_label_name] = mx.nd.array(
                    data[cls_label_name].reshape(
                        (softmax_shape[0],
                         softmax_shape[1] * softmax_shape[2])), self.ctx)
                self.arg_params[proj_label_name] = mx.nd.array(
                    data[proj_label_name], self.ctx)
                self.arg_params[proj_weight_name] = mx.nd.array(
                    data[proj_weight_name], self.ctx)
                self.arg_params[ground_truth_name] = mx.nd.array(
                    data[ground_truth_name], self.ctx)
                self.arg_params[bbox_label_name] = mx.nd.array(
                    data[bbox_label_name], self.ctx)
                self.arg_params[bbox_weight_name] = mx.nd.array(
                    data[bbox_weight_name], self.ctx)
                self.arg_params["mean_face"] = mx.nd.array(
                    train_data.mean_face, self.ctx)

                self.executor = self.symbol.bind(self.ctx,
                                                 self.arg_params,
                                                 args_grad=self.grad_params,
                                                 grad_req=grad_req,
                                                 aux_states=self.aux_params)
                assert len(self.symbol.list_arguments()) == len(
                    self.executor.grad_arrays)

                update_dict = {
                    name: nd
                    for name, nd in zip(self.symbol.list_arguments(),
                                        self.executor.grad_arrays)
                    if nd is not None
                }
                output_dict = {}
                output_buff = {}
                for key, arr in zip(self.symbol.list_outputs(),
                                    self.executor.outputs):
                    output_dict[key] = arr
                    output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
                self.executor.forward(is_train=True)
                for key in output_dict:
                    output_dict[key].copyto(output_buff[key])
                self.executor.backward()
                '''
                for i in xrange(0, 49):
                    if self.executor.grad_arrays[i] != None:
                        print i, arg_names[i], self.executor.grad_arrays[i].asnumpy()[0]
                '''

                for key, arr in update_dict.items():
                    if key != 'upsample_proposal_weight':
                        self.updater(key, arr, self.arg_params[key])
                        '''
                        if key == 'config_fc1_weight':
                            print 'config_fc1_weight'
                            print 'param:', self.arg_params[key].asnumpy()
                            print 'grad:', self.executor.grad_arrays[39].asnumpy()
                        if key == 'refine_proj_param_weight':
                            print 'refine_proj_param_weight'
                            print 'param:', self.arg_params[key].asnumpy()
                            print 'grad:', self.executor.grad_arrays[47].asnumpy()
                        '''

                pred_shape = self.executor.outputs[0].shape
                index_label = np.nonzero(data[cls_label_name].reshape(
                    softmax_shape[0], softmax_shape[1] * softmax_shape[2]) -
                                         255)
                label = mx.nd.array(data[cls_label_name].reshape(
                    softmax_shape[0],
                    softmax_shape[1] * softmax_shape[2])[:, index_label[1]])
                pred = mx.nd.array(
                    (output_buff["proposal_cls_loss_output"].asnumpy().reshape(
                        pred_shape[0], pred_shape[1],
                        pred_shape[2] * pred_shape[3]))[..., index_label[1]])
                if softmax_metric:
                    tempt = softmax_metric(label, pred, 11)
                    softmax_count += tempt
                    softmax_batch += tempt

                # for q in range(0, 50):
                #    print label.asnumpy()[0, q], ':', pred.asnumpy()[0, 0, q], pred.asnumpy()[0, 1, q]

                proj_label = data[proj_label_name]
                proj_weight = data[proj_weight_name]
                proj_pred = output_buff["proj_regression_loss_output"].asnumpy()\
                    .reshape(data[proj_weight_name].shape)
                index_nonzero = np.nonzero(data[proj_weight_name])
                proj_regress_tmp = regression_metric(
                    proj_label[index_nonzero], proj_pred[index_nonzero],
                    proj_weight[index_nonzero])
                proj_regress_loss_t += proj_regress_tmp
                proj_regress_loss_b += proj_regress_tmp

                ell_label = output_buff["ell_label_output"].asnumpy()
                bbox_pred = output_buff["ellipse_predict_loss_output"].asnumpy(
                )
                bbox_predict_tmp = bbox_predict_metric(ell_label, bbox_pred)
                bbox_predict_loss_t += bbox_predict_tmp
                bbox_predict_loss_b += bbox_predict_tmp

                self.executor.outputs[0].wait_to_read()
                self.executor.outputs[1].wait_to_read()
                self.executor.outputs[2].wait_to_read()
                self.executor.outputs[3].wait_to_read()

                print "Training-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f" % \
                      (epoch, nbatch, get_accuracy(tempt, self.bgfg), proj_regress_tmp,
                       bbox_predict_tmp[0], bbox_predict_tmp[1])
                f.write(
                    "Training-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f\n"
                    % (epoch, nbatch, get_accuracy(
                        tempt, self.bgfg), proj_regress_tmp,
                       bbox_predict_tmp[0], bbox_predict_tmp[1]))

                img_info = train_data.AllImg[nbatch - 1]
                print "%s\twidth: %d height: %d num_face: %d" % \
                      (img_info.filename, img_info.width, img_info.height, img_info.num_faces)
                f.write("%s\twidth: %d height: %d num_face: %d\n" % \
                        (img_info.filename, img_info.width, img_info.height, img_info.num_faces))

                if nbatch % 50 == 0:
                    print_accuracy(softmax_batch, f, train_data.class_names,
                                   self.bgfg)
                    softmax_batch = np.zeros((11, 3))
                    print "Keypoints projection regression smoothl1 loss:\t", proj_regress_loss_b / 50
                    f.write(
                        "Keypoints projection regression smoothl1 loss:\t%f\n"
                        % (proj_regress_loss_b / 50))
                    print "Bounding box regression:\t", bbox_predict_loss_b / 50
                    f.write("Bounding box regression: %f %f\n" %
                            (bbox_predict_loss_b[0] / 50,
                             bbox_predict_loss_b[1] / 50))
                    #print "Keypoints offset regression smoothl1 loss:\t", offset_regress_loss_b / 50
                    #f.write("Keypoints offset regression smoothl1 loss:\t%f\n" % (offset_regress_loss_b / 50))
                    #print "Keypoints visibility accuracy:\t", float(softmax_vis_batch[2]) / float(softmax_vis_batch[0])
                    #f.write("Keypoints visibility accuracy:\t%f\n" %
                    #        (float(softmax_vis_batch[2]) / float(softmax_vis_batch[0])))
                    softmax_vis_batch = np.zeros(3)
                    proj_regress_loss_b = .0
                    offset_regress_loss_b = .0
                    bbox_predict_loss_b = np.array([.0, .0])

                if nbatch % 1000 == 0:
                    if epoch_end_callback is not None:
                        epoch_end_callback(epoch * 100000 + nbatch,
                                           self.symbol, self.arg_params,
                                           self.aux_params)

            name, value = eval_metric.get()
            print_accuracy(softmax_count, f, train_data.class_names, self.bgfg)
            logger.info("--->Epoch[%d] Train-cls-%s=%f", epoch, name, value)
            logger.info("--->Epoch[%d] Train-proj-reg-smoothl1=%f", epoch,
                        proj_regress_loss_t / nbatch)
            logger.info("--->Epoch[%d] Train-bbox-reg-smoothl1=%f, %f", epoch,
                        bbox_predict_loss_t[0] / nbatch,
                        bbox_predict_loss_t[1] / nbatch)
            #logger.info("--->Epoch[%d] Train-offset-reg-smoothl1=%f", epoch, offset_regress_loss_t / nbatch)
            #logger.info("--->Epoch[%d] Train-vis-acc=%f", epoch, float(softmax_vis_count[2]) / float(softmax_vis_count[0]))
            f.write("--->Epoch[%d] Train-cls-%s=%f\n" % (epoch, name, value))
            f.write("--->Epoch[%d] Train-proj-reg-smoothl1=%f\n" %
                    (epoch, proj_regress_loss_t / nbatch))
            f.write("--->Epoch[%d] Train-bbox-reg-smoothl1=%f, %f" %
                    (epoch, bbox_predict_loss_t[0] / nbatch,
                     bbox_predict_loss_t[1] / nbatch))
            #f.write("--->Epoch[%d] Train-offset-reg-smoothl1=%f\n" % (epoch, offset_regress_loss_t / nbatch))
            #f.write("--->Epoch[%d] Train-vis-acc=%f" % (epoch, float(softmax_vis_count[2]) / float(softmax_vis_count[0])))

        f.close()
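
The loop above drives a manually bound executor. For reference, a minimal self-contained sketch of the same bind/forward/backward/update cycle, using a toy regression symbol; the symbol, shapes, and the name toy_step are illustrative, not taken from the example:

import mxnet as mx
import numpy as np
from mxnet import optimizer as opt

def toy_step():
    # toy stand-in for the detection network used above
    data = mx.sym.Variable('data')
    label = mx.sym.Variable('label')
    fc = mx.sym.FullyConnected(data, num_hidden=1, name='fc')
    loss = mx.sym.LinearRegressionOutput(fc, label, name='lro')

    exe = loss.simple_bind(mx.cpu(), data=(8, 4), label=(8, 1))
    for arr in exe.arg_dict.values():
        arr[:] = np.random.uniform(-0.1, 0.1, arr.shape)

    optimizer = opt.create('sgd', learning_rate=0.01, rescale_grad=1.0 / 8)
    updater = opt.get_updater(optimizer)

    exe.forward(is_train=True)
    exe.backward()
    # update every argument that received a gradient, as the loop above does
    for key, grad in zip(loss.list_arguments(), exe.grad_arrays):
        if grad is not None and key not in ('data', 'label'):
            updater(key, grad, exe.arg_dict[key])

toy_step()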
Ejemplo n.º 21
0
 def fit(self,
         train_data,
         eval_data=None,
         eval_metric='acc',
         grad_req='write',
         epoch_end_callback=None,
         batch_end_callback=None,
         kvstore='local',
         logger=None):
     if logger is None:
         logger = logging
      logger.info('Start training with %s', str(self.ctx))
     arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
         data=train_data.provide_data[0][1])
     arg_names = self.symbol.list_arguments()
     if grad_req != 'null':
         self.grad_params = {}
         for name, shape in zip(arg_names, arg_shapes):
             if not (name.endswith('data') or name.endswith('label')):
                 self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
     else:
         self.grad_params = None
     aux_names = self.symbol.list_auxiliary_states()
     self.aux_params = {
         k: nd.zeros(s)
         for k, s in zip(aux_names, aux_shapes)
     }
     data_name = train_data.data_name
     label_name = train_data.label_name
     input_names = [data_name, label_name]
     self.optimizer = opt.create(self.optimizer,
                                 rescale_grad=(1.0 /
                                               train_data.get_batch_size()),
                                 **(self.kwargs))
     self.updater = get_updater(self.optimizer)
     eval_metric = metric.create(eval_metric)
     # begin training
     for epoch in range(self.begin_epoch, self.num_epoch):
         nbatch = 0
         train_data.reset()
         eval_metric.reset()
         for data in train_data:
             nbatch += 1
             label_shape = data[label_name].shape
             self.arg_params[data_name] = mx.nd.array(
                 data[data_name], self.ctx)
             self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \
                 label_shape[1]*label_shape[2]), self.ctx)
             output_names = self.symbol.list_outputs()
              self.executor = self.symbol.bind(self.ctx,
                                               self.arg_params,
                                               args_grad=self.grad_params,
                                               grad_req=grad_req,
                                               aux_states=self.aux_params)
              assert len(self.symbol.list_arguments()) == len(
                  self.executor.grad_arrays)
              update_dict = {name: nd for name, nd in zip(self.symbol.list_arguments(), \
                  self.executor.grad_arrays) if nd is not None}
             output_dict = {}
             output_buff = {}
              for key, arr in zip(self.symbol.list_outputs(),
                                  self.executor.outputs):
                 output_dict[key] = arr
                 output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
              self.executor.forward(is_train=True)
             for key in output_dict:
                 output_dict[key].copyto(output_buff[key])
              self.executor.backward()
             for key, arr in update_dict.items():
                 if key != "bigscore_weight":
                     self.updater(key, arr, self.arg_params[key])
              pred_shape = self.executor.outputs[0].shape
             label = mx.nd.array(data[label_name].reshape(
                 label_shape[0], label_shape[1] * label_shape[2]))
             pred = mx.nd.array(output_buff["softmax_output"].asnumpy().reshape(pred_shape[0], \
                 pred_shape[1], pred_shape[2]*pred_shape[3]))
             eval_metric.update([label], [pred])
              self.executor.outputs[0].wait_to_read()
             batch_end_params = BatchEndParam(epoch=epoch,
                                              nbatch=nbatch,
                                              eval_metric=eval_metric)
             batch_end_callback(batch_end_params)
         if epoch_end_callback is not None:
             epoch_end_callback(epoch, self.symbol, self.arg_params,
                                self.aux_params)
         name, value = eval_metric.get()
         logger.info("                     --->Epoch[%d] Train-%s=%f",
                     epoch, name, value)
         # evaluation
         if eval_data:
             logger.info(" in eval process...")
             nbatch = 0
             eval_data.reset()
             eval_metric.reset()
             for data in eval_data:
                 nbatch += 1
                 label_shape = data[label_name].shape
                 self.arg_params[data_name] = mx.nd.array(
                     data[data_name], self.ctx)
                 self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \
                     label_shape[1]*label_shape[2]), self.ctx)
                  executor = self.symbol.bind(self.ctx,
                                              self.arg_params,
                                              args_grad=self.grad_params,
                                              grad_req=grad_req,
                                              aux_states=self.aux_params)
                  cpu_output_array = mx.nd.zeros(executor.outputs[0].shape)
                  executor.forward(is_train=False)
                  executor.outputs[0].copyto(cpu_output_array)
                 pred_shape = cpu_output_array.shape
                 label = mx.nd.array(data[label_name].reshape(label_shape[0], \
                     label_shape[1]*label_shape[2]))
                 pred = mx.nd.array(cpu_output_array.asnumpy().reshape(pred_shape[0], \
                     pred_shape[1], pred_shape[2]*pred_shape[3]))
                 eval_metric.update([label], [pred])
                  executor.outputs[0].wait_to_read()
         name, value = eval_metric.get()
         logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
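
As in the snippets above, metric.create also accepts a plain function and wraps it in a CustomMetric; a small usage sketch, with mean_abs as an illustrative metric name:

import numpy as np
import mxnet as mx
from mxnet import metric

def mean_abs(label, pred):
    # custom metric callables receive numpy arrays and return a scalar
    return np.abs(label - pred).mean()

m = metric.create(mean_abs, name='mean_abs')
m.update([mx.nd.array([1., 2., 3.])], [mx.nd.array([1., 2., 4.])])
print(m.get())  # roughly ('mean_abs', 0.333)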
Ejemplo n.º 22
0
    def fit(self, train_data, eval_data,
            eval_metric='mse',
            grad_req='write',
            epoch_end_callback=None,
            batch_end_callback=None,
            kv_store='local',
            logger=None):

        if logger is None:
            logger = logging
        logger.info('Starting training with %s', str(self.ctx))
        arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data = train_data.provide_data[0][1])
        arg_names = self.symbol.list_arguments()

        if grad_req != 'null':
            self.grad_params = {}
            for name, shape in zip(arg_names, arg_shapes):
                if not (name.endswith('data') or name.endswith('label')):
                    self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
        # init the params
        self.arg_params = {k : mx.nd.zeros(s, self.ctx) for k, s in zip(arg_names, arg_shapes)}
        for k, v in self.arg_params.items():
            if not (k.endswith('data') or k.endswith('label')):
                self.initializer(k, v)

        # init the aux params
        aux_names = self.symbol.list_auxiliary_states()
        self.aux_params = {k : mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)}

        data_name = train_data.data_name
        label_name = train_data.label_name
        input_names = [data_name, label_name]

        self.optimizer = mx.optimizer.create(self.optimizer, rescale_grad = (1.0/train_data.get_batch_size()), **(self.kwargs))
        self.updater = mx.optimizer.get_updater(self.optimizer)

        eval_metric = metric.create(eval_metric)

        # begin training
        for epoch in range(self.begin_epoch, self.num_epoch):
            nbatch = 0
            train_data.reset()
            eval_metric.reset()

            # train
            for databatch in train_data:
                nbatch += 1
                for k, v in databatch.data.items():
                    self.arg_params[k] = mx.nd.array(v, self.ctx)
                for k, v in databatch.label.items():
                    self.arg_params[k] = mx.nd.array(v, self.ctx)
                executor = self.symbol.bind(self.ctx, self.arg_params, args_grad = self.grad_params,
                    grad_req = grad_req, aux_states = self.aux_params)
                # print(nbatch)
                update_dict = {name: nd for name, nd
                               in zip(self.symbol.list_arguments(), executor.grad_arrays) if nd is not None}
                output_dict = {name:nd for name, nd
                                in zip(self.symbol.list_outputs(), executor.outputs)}
                # pdb.set_trace()
                executor.forward(is_train=True)
                executor.backward()

                for key, arr in update_dict.items():
                    self.updater(key, arr, self.arg_params[key])

                label = self.arg_params['lr_label']
                pred = output_dict['lr_output']
                eval_metric.update([label], [pred])
                executor.outputs[0].wait_to_read()
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric)
                batch_end_callback(batch_end_params)
            if epoch_end_callback is not None:
                epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params)
            # pdb.set_trace()
            name, value = eval_metric.get()
            logger.info("------------------------------>Epoch[%d] Train-%s=%f", epoch, name, value)

            # begin evaluation
            if eval_data:
                logger.info("in eval process...")
                nbatch = 0
                eval_data.reset()
                eval_metric.reset()
                for databatch in eval_data:
                    nbatch += 1
                    for k, v in databatch.data.items():
                        self.arg_params[k] = mx.nd.array(v, self.ctx)
                    for k, v in databatch.label.items():
                        self.arg_params[k] = mx.nd.array(v, self.ctx)
                    executor = self.symbol.bind(self.ctx, self.arg_params, args_grad = self.grad_params,
                        grad_req = grad_req, aux_states = self.aux_params)

                    output_dict = {name:nd for name, nd
                                    in zip(self.symbol.list_outputs(), executor.outputs)}
                    executor.forward(is_train=False)
                    label = self.arg_params['lr_label']
                    pred = output_dict['lr_output']
                    eval_metric.update([label], [pred])
            name, value = eval_metric.get()
            logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
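
The batch_end_callback(batch_end_params) calls in these examples expect a callback such as mx.callback.Speedometer; a minimal wiring sketch, assuming the four-field BatchEndParam namedtuple of recent MXNet versions (older versions differ):

import mxnet as mx
from mxnet.model import BatchEndParam

speedometer = mx.callback.Speedometer(batch_size=32, frequent=50)
eval_metric = mx.metric.create('acc')

# inside the batch loop one would call:
params = BatchEndParam(epoch=0, nbatch=50, eval_metric=eval_metric, locals=None)
speedometer(params)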
Ejemplo n.º 23
0
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(('learning_rate', 0.01), ),
            eval_end_callback=None,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            validation_metric=None,
            monitor=None,
            sparse_row_id_fn=None,
            profile=False):
        """Trains the module parameters.
        Check out the `Module Tutorial <http://mxnet.io/tutorials/basic/module.html>`_ to see
        an end-to-end use-case.
        Parameters
        ----------
        train_data : DataIter
            Train DataIter.
        eval_data : DataIter
            If not ``None``, will be used as validation set and the performance
            after each epoch will be evaluated.
        eval_metric : str or EvalMetric
            Defaults to 'accuracy'. The performance measure used to display during training.
            Other possible predefined metrics are:
            'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'.
        epoch_end_callback : function or list of functions
            Each callback will be called with the current `epoch`, `symbol`, `arg_params`
            and `aux_params`.
        batch_end_callback : function or list of function
            Each callback will be called with a `BatchEndParam`.
        kvstore : str or KVStore
            Defaults to 'local'.
        optimizer : str or Optimizer
            Defaults to 'sgd'.
        optimizer_params : dict
            Defaults to ``(('learning_rate', 0.01),)``. The parameters for
            the optimizer constructor.
            The default value is not a dict, just to avoid pylint warning on dangerous
            default values.
        eval_end_callback : function or list of function
            These will be called at the end of each full evaluation, with the metrics over
            the entire evaluation set.
        eval_batch_end_callback : function or list of function
            These will be called at the end of each mini-batch during evaluation.
        initializer : Initializer
            The initializer is called to initialize the module parameters when they are
            not already initialized.
        arg_params : dict
            Defaults to ``None``, if not ``None``, should be existing parameters from a trained
            model or loaded from a checkpoint (previously saved model). In this case,
            the value here will be used to initialize the module parameters, unless they
            are already initialized by the user via a call to `init_params` or `fit`.
            `arg_params` has a higher priority than `initializer`.
        aux_params : dict
            Defaults to ``None``. Similar to `arg_params`, except for auxiliary states.
        allow_missing : bool
            Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params`
            and `aux_params` are not ``None``. If this is ``True``, then the missing parameters
            will be initialized via the `initializer`.
        force_rebind : bool
            Defaults to ``False``. Whether to force rebinding the executors if already bound.
        force_init : bool
            Defaults to ``False``. Indicates whether to force initialization even if the
            parameters are already initialized.
        begin_epoch : int
            Defaults to 0. Indicates the starting epoch. Usually, if resumed from a
            checkpoint saved at a previous training phase at epoch N, then this value should be
            N+1.
        num_epoch : int
            Number of epochs for training.
        sparse_row_id_fn : A callback function
            The function  takes `data_batch` as an input and returns a dict of
            str -> NDArray. The resulting dict is used for pulling row_sparse
            parameters from the kvstore, where the str key is the name of the param,
            and the value is the row id of the param to pull.
        """
        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label,
                  for_training=True,
                  force_rebind=force_rebind)

        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer,
                         arg_params=arg_params,
                         aux_params=aux_params,
                         allow_missing=allow_missing,
                         force_init=force_init)
        self.init_optimizer(kvstore=kvstore,
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            nbatch = 0
            data_iter = iter(train_data)
            end_of_batch = False
            next_data_batch = next(data_iter)
            while not end_of_batch:
                data_batch = next_data_batch
                if monitor is not None:
                    monitor.tic()
                self.forward_backward(data_batch)
                self.update()

                if isinstance(data_batch, list):
                    self.update_metric(eval_metric,
                                       [db.label for db in data_batch],
                                       pre_sliced=True)
                else:
                    self.update_metric(eval_metric, data_batch.label)

                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch,
                                 sparse_row_id_fn=sparse_row_id_fn)
                except StopIteration:
                    end_of_batch = True

                if monitor is not None:
                    monitor.toc_print()

                if end_of_batch:
                    eval_name_vals = eval_metric.get_name_value()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                nbatch += 1

                if profile and nbatch == 10:
                    self.logger.info("Profiling ends")
                    import mxnet as mx
                    mx.profiler.dump()

            # one epoch of training is finished
            for name, val in eval_name_vals:
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None and self._kvstore.rank == 0:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
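
The while-loop above pre-fetches the next batch before processing the current one, so iterators that load asynchronously can overlap I/O with compute; the same control flow in isolation, with a plain iterator standing in for the DataIter (a sketch, not the module code):

def prefetch_loop(batches, process):
    data_iter = iter(batches)
    end_of_batch = False
    next_batch = next(data_iter)
    while not end_of_batch:
        batch = next_batch
        try:
            next_batch = next(data_iter)  # fetch ahead of processing
        except StopIteration:
            end_of_batch = True
        process(batch)

prefetch_loop(range(3), print)  # processes 0, 1, 2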
Ejemplo n.º 24
0
    def fit(self):
        # kvstore
        if self.kv_store == 'local' and (self.gpus is None
                                         or len(self.gpus.split(',')) == 1):
            kv = None
        else:
            kv = mx.kvstore.create(self.kv_store)

        # setup module, including symbol, params and aux
        # get_model should always be called before get_data_iterator to ensure correct data loader
        self.get_model()

        # get dataloader
        train_data, eval_data = self.get_data_iterator()

        # evaluate metrics
        eval_metric_lst = []
        if "acc" in self.eval_metric:
            eval_metric_lst.append(metric.create(self.eval_metric))
        if "acc_ignore" in self.eval_metric and self.ignore_label is not None:
            eval_metric_lst.append(
                AccWithIgnoreMetric(self.ignore_label, name="acc_ignore"))
        if "IoU" in self.eval_metric and self.ignore_label is not None:
            eval_metric_lst.append(
                IoUMetric(self.ignore_label,
                          label_num=self.label_num,
                          name="IoU"))
        eval_metric_lst.append(
            SoftmaxLoss(self.ignore_label,
                        label_num=self.label_num,
                        name="SoftmaxLoss"))
        eval_metrics = CompositeEvalMetric(metrics=eval_metric_lst)

        optimizer_params = {}
        # optimizer
        # lr policy
        if self.lr_policy == 'step' and self.lr_factor < 1 and self.lr_factor_epoch > 0:
            optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
                step=max(int(self.epoch_size * self.lr_factor_epoch), 1),
                factor=self.lr_factor)
        elif self.lr_policy == 'poly':
            optimizer_params['lr_scheduler'] = lr_scheduler.PolyScheduler(
                origin_lr=self.lr,
                max_samples=max(int(self.epoch_size * self.num_epochs), 1),
                factor=self.lr_factor)
        else:
            logging.error('Unknown lr policy: %s', self.lr_policy)
        optimizer_params['learning_rate'] = self.lr
        optimizer_params['momentum'] = self.momentum
        optimizer_params['wd'] = self.weight_decay
        optimizer_params['rescale_grad'] = 1.0 / self.batch_size
        optimizer_params['clip_gradient'] = 5

        # directory for saving models
        model_path = os.path.join(self.model_dir, self.save_model_prefix)
        if not os.path.isdir(model_path):
            os.mkdir(model_path)
        model_full_path = os.path.join(
            model_path,
            datetime.now().strftime('%Y_%m_%d_%H:%M:%S'))
        if not os.path.isdir(model_full_path):
            os.mkdir(model_full_path)
        checkpoint = utils.do_checkpoint(
            os.path.join(model_full_path, self.save_model_prefix),
            self.checkpoint_interval)
        with open(
                os.path.join(
                    model_full_path, 'train_' +
                    datetime.now().strftime('%Y_%m_%d_%H:%M:%S') + '.cfg'),
                'w') as f:
            self.config.write(f)
        utils.save_symbol(
            self.symbol, os.path.join(model_full_path, self.save_model_prefix))
        utils.save_log(self.save_model_prefix, model_full_path)

        # draw network
        if self.draw_network:
            utils.draw_network(
                self.symbol,
                os.path.join(model_full_path, self.save_model_prefix),
                self.data_shape[0])

        # batch_end_callback
        batch_end_callback = list()
        batch_end_callback.append(utils.Speedometer(self.batch_size, 10))

        module = mx.module.Module(self.symbol,
                                  context=self.ctx,
                                  data_names=self.data_name,
                                  label_names=self.label_name)

        # initialize (base_module no longer does this initialization)
        train_data.reset()
        module.fit(
            train_data=train_data,
            eval_data=eval_data,
            eval_metric=eval_metrics,
            epoch_end_callback=checkpoint,
            batch_end_callback=batch_end_callback,
            kvstore=kv,
            optimizer=self.optimizer,
            optimizer_params=optimizer_params,
            initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
            arg_params=self.arg_params,
            aux_params=self.aux_params,
            allow_missing=True,
            begin_epoch=self.load_epoch,
            num_epoch=self.num_epochs,
        )
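
A condensed sketch of the 'step' learning-rate policy configured above; epoch_size, the batch size, and the factor values are illustrative:

import mxnet as mx

epoch_size = 1000  # batches per epoch (illustrative)
optimizer_params = {
    'learning_rate': 0.01,
    'momentum': 0.9,
    'wd': 0.0005,
    'rescale_grad': 1.0 / 16,
    'clip_gradient': 5,
    # multiply the lr by `factor` every `step` updates, i.e. every 10 epochs here
    'lr_scheduler': mx.lr_scheduler.FactorScheduler(step=epoch_size * 10,
                                                    factor=0.5),
}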
Ejemplo n.º 25
0
 def fit(self, train_data, eval_data=None,
         eval_metric='acc',
         grad_req='write',
         epoch_end_callback=None,
         batch_end_callback=None,
         kvstore='local',
         logger=None):
     if logger is None:
         logger = logging
      logger.info('Start training with %s', str(self.ctx))
     arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data=train_data.provide_data[0][1])
     arg_names = self.symbol.list_arguments()
     if grad_req != 'null':
         self.grad_params = {}
         for name, shape in zip(arg_names, arg_shapes):
             if not (name.endswith('data') or name.endswith('label')):
                 self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
     else:
         self.grad_params = None
     aux_names = self.symbol.list_auxiliary_states()
     self.aux_params = {k : nd.zeros(s) for k, s in zip(aux_names, aux_shapes)}
     data_name = train_data.data_name
     label_name = train_data.label_name
     input_names = [data_name, label_name]
     self.optimizer = opt.create(self.optimizer, rescale_grad=(1.0/train_data.get_batch_size()), **(self.kwargs))
     self.updater = get_updater(self.optimizer)
     eval_metric = metric.create(eval_metric)
     # begin training
     for epoch in range(self.begin_epoch, self.num_epoch):
         nbatch = 0
         train_data.reset()
         eval_metric.reset()
         for data in train_data:
             nbatch += 1
             label_shape = data[label_name].shape
             self.arg_params[data_name] = mx.nd.array(data[data_name], self.ctx)
             self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \
                 label_shape[1]*label_shape[2]), self.ctx)
             output_names = self.symbol.list_outputs()
              self.executor = self.symbol.bind(self.ctx, self.arg_params,
                              args_grad=self.grad_params,
                              grad_req=grad_req,
                              aux_states=self.aux_params)
              assert len(self.symbol.list_arguments()) == len(self.executor.grad_arrays)
              update_dict = {name: nd for name, nd in zip(self.symbol.list_arguments(), \
                  self.executor.grad_arrays) if nd is not None}
             output_dict = {}
             output_buff = {}
              for key, arr in zip(self.symbol.list_outputs(), self.executor.outputs):
                 output_dict[key] = arr
                 output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
              self.executor.forward(is_train=True)
             for key in output_dict:
                 output_dict[key].copyto(output_buff[key])
              self.executor.backward()
             for key, arr in update_dict.items():
                 if key != "bigscore_weight":
                     self.updater(key, arr, self.arg_params[key])
              pred_shape = self.executor.outputs[0].shape
             label = mx.nd.array(data[label_name].reshape(label_shape[0], label_shape[1]*label_shape[2]))
             pred = mx.nd.array(output_buff["softmax_output"].asnumpy().reshape(pred_shape[0], \
                 pred_shape[1], pred_shape[2]*pred_shape[3]))
             eval_metric.update([label], [pred])
              self.executor.outputs[0].wait_to_read()
             batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric)
             batch_end_callback(batch_end_params)
         if epoch_end_callback is not None:
             epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params)
         name, value = eval_metric.get()
         logger.info("                     --->Epoch[%d] Train-%s=%f", epoch, name, value)
         # evaluation
         if eval_data:
             logger.info(" in eval process...")
             nbatch = 0
             eval_data.reset()
             eval_metric.reset()
             for data in eval_data:
                 nbatch += 1
                 label_shape = data[label_name].shape
                 self.arg_params[data_name] = mx.nd.array(data[data_name], self.ctx)
                 self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \
                     label_shape[1]*label_shape[2]), self.ctx)
                  executor = self.symbol.bind(self.ctx, self.arg_params,
                                  args_grad=self.grad_params,
                                  grad_req=grad_req,
                                  aux_states=self.aux_params)
                  cpu_output_array = mx.nd.zeros(executor.outputs[0].shape)
                  executor.forward(is_train=False)
                  executor.outputs[0].copyto(cpu_output_array)
                 pred_shape = cpu_output_array.shape
                 label = mx.nd.array(data[label_name].reshape(label_shape[0], \
                     label_shape[1]*label_shape[2]))
                 pred = mx.nd.array(cpu_output_array.asnumpy().reshape(pred_shape[0], \
                     pred_shape[1], pred_shape[2]*pred_shape[3]))
                 eval_metric.update([label], [pred])
                  executor.outputs[0].wait_to_read()
         name, value = eval_metric.get()
         logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
Ejemplo n.º 26
0
    def fit(
        self,
        train_data,
        eval_data=None,
        eval_metric="acc",
        epoch_end_callback=None,
        batch_end_callback=None,
        kvstore="local",
        optimizer="sgd",
        optimizer_params=(("learning_rate", 0.01),),
        eval_end_callback=None,
        eval_batch_end_callback=None,
        initializer=Uniform(0.01),
        arg_params=None,
        aux_params=None,
        allow_missing=False,
        force_rebind=False,
        force_init=False,
        begin_epoch=0,
        num_epoch=None,
        validation_metric=None,
        monitor=None,
        prefix=None,
    ):
        """Train the module parameters.

        Parameters
        ----------
        train_data : DataIter
        eval_data : DataIter
            If not `None`, will be used as validation set and evaluate the performance
            after each epoch.
        eval_metric : str or EvalMetric
            Default `'acc'`. The performance measure used to display during training.
        epoch_end_callback : function or list of function
            Each callback will be called with the current `epoch`, `symbol`, `arg_params`
            and `aux_params`.
        batch_end_callback : function or list of function
            Each callback will be called with a `BatchEndParam`.
        kvstore : str or KVStore
            Default `'local'`.
        optimizer : str or Optimizer
            Default `'sgd'`
        optimizer_params : dict
            Default `(('learning_rate', 0.01),)`. The parameters for the optimizer constructor.
            The default value is not a `dict`, just to avoid pylint warning on dangerous
            default values.
        eval_end_callback : function or list of function
            These will be called at the end of each full evaluation, with the metrics over
            the entire evaluation set.
        eval_batch_end_callback : function or list of function
            These will be called at the end of each minibatch during evaluation
        initializer : Initializer
            Will be called to initialize the module parameters if not already initialized.
        arg_params : dict
            Default `None`, if not `None`, should be existing parameters from a trained
            model or loaded from a checkpoint (previously saved model). In this case,
            the value here will be used to initialize the module parameters, unless they
            are already initialized by the user via a call to `init_params` or `fit`.
            `arg_params` has higher priority to `initializer`.
        aux_params : dict
            Default `None`. Similar to `arg_params`, except for auxiliary states.
        allow_missing : bool
            Default `False`. Indicate whether we allow missing parameters when `arg_params`
            and `aux_params` are not `None`. If this is `True`, then the missing parameters
            will be initialized via the `initializer`.
        force_rebind : bool
            Default `False`. Whether to force rebinding the executors if already bound.
        force_init : bool
            Default `False`. Indicate whether we should force initialization even if the
            parameters are already initialized.
        begin_epoch : int
            Default `0`. Indicate the starting epoch. Usually, if we are resuming from a
            checkpoint saved at a previous training phase at epoch N, then we should specify
            this value as N+1.
        num_epoch : int
            Number of epochs to run training.

        Examples
        --------
        An example of using fit for training::
            >>> #Assume training dataIter and validation dataIter are ready
            >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter,
                        optimizer_params={'learning_rate':0.01, 'momentum': 0.9},
                        num_epoch=10)
        """
        assert num_epoch is not None, "please specify number of epochs"

        self.bind(
            data_shapes=train_data.provide_data,
            label_shapes=train_data.provide_label,
            for_training=True,
            force_rebind=force_rebind,
        )
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(
            initializer=initializer,
            arg_params=arg_params,
            aux_params=aux_params,
            allow_missing=allow_missing,
            force_init=force_init,
        )
        self.init_optimizer(
            kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params
        )

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        ################################################################################
        # training loop
        ################################################################################
        # epoch 0
        if epoch_end_callback is not None:
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)
            for callback in _as_list(epoch_end_callback):
                callback(-1, self.symbol, arg_params, aux_params)

        from lib.pair_matching.batch_updater_py_multi import batchUpdaterPyMulti

        config = self.config
        if config.TRAIN.TENSORBOARD_LOG:
            from mxboard import SummaryWriter

            tf_log_dir = os.path.join(
                os.path.dirname(prefix),
                "logs/{}".format(time.strftime("%Y-%m-%d-%H-%M")),
            )
            summ_writer = SummaryWriter(logdir=tf_log_dir)

        interBatchUpdater = batchUpdaterPyMulti(config, 480, 640)
        last_lr = 0
        cur_step = 0
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            for nbatch, data_batch in enumerate(train_data):
                if monitor is not None:
                    monitor.tic()
                # disp weights L2 norm
                cur_lr = self._curr_module._optimizer._get_lr(0)
                if nbatch % (4000 // train_data.batch_size) == 0:
                    all_params = self._curr_module.get_params()[0]
                    all_param_names = all_params.keys()
                    all_param_names = sorted(all_param_names)
                    print_and_log(prefix, self.logger)
                    weight_str = ""
                    for view_name in all_param_names:
                        weight_str += "{}: {} ".format(
                            view_name, nd.norm(all_params[view_name]).asnumpy()
                        )
                    print_and_log(weight_str, self.logger)
                    print_and_log(
                        "batch {}: lr: {}".format(nbatch, cur_lr), self.logger
                    )
                    if config.TRAIN.TENSORBOARD_LOG:
                        summ_writer.add_scalar(
                            tag="learning_rate", value=cur_lr, global_step=cur_step
                        )
                if cur_lr != last_lr:
                    print_and_log(
                        "batch {}: lr: {}".format(nbatch, cur_lr), self.logger
                    )
                    last_lr = cur_lr
                    if config.TRAIN.TENSORBOARD_LOG:
                        summ_writer.add_scalar(
                            tag="learning_rate", value=cur_lr, global_step=cur_step
                        )

                train_iter_size = config.network.TRAIN_ITER_SIZE
                for iter_idx in range(train_iter_size):
                    self.forward_backward(data_batch)
                    preds = self._curr_module.get_outputs(False)
                    self.update()
                    if iter_idx != train_iter_size - 1:
                        data_batch = interBatchUpdater.forward(
                            data_batch, preds, config
                        )
                cur_step += 1
                self.update_metric(eval_metric, data_batch.label)

                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(
                        epoch=epoch,
                        nbatch=nbatch,
                        eval_metric=eval_metric,
                        locals=locals(),
                    )
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                if config.TRAIN.TENSORBOARD_LOG:
                    for name, val in eval_metric.get_name_value():
                        summ_writer.add_scalar(
                            tag="BatchTrain-{}".format(name),
                            value=val,
                            global_step=cur_step,
                        )

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info("Epoch[%d] Train-%s=%f", epoch, name, val)
                if config.TRAIN.TENSORBOARD_LOG:
                    summ_writer.add_scalar(
                        tag="EpochTrain-{}".format(name), value=val, global_step=epoch
                    )

            toc = time.time()
            self.logger.info("Epoch[%d] Time cost=%.3f", epoch, (toc - tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            # ----------------------------------------
            # evaluation on validation set
            if eval_data:
                res = self.score(
                    eval_data,
                    validation_metric,
                    score_end_callback=eval_end_callback,
                    batch_end_callback=eval_batch_end_callback,
                    epoch=epoch,
                )
                # TODO: pull this into default
                for name, val in res:
                    self.logger.info("Epoch[%d] Validation-%s=%f", epoch, name, val)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
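
The TENSORBOARD_LOG branch above depends on the separate mxboard package; a minimal logging sketch (the log directory and values are illustrative):

from mxboard import SummaryWriter

sw = SummaryWriter(logdir='./logs/demo')
for step in range(3):
    # one scalar point per global step, viewable in TensorBoard
    sw.add_scalar(tag='learning_rate', value=0.01 * (0.9 ** step),
                  global_step=step)
sw.close()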
Ejemplo n.º 27
0
    def fit(self, train_data, eval_data=None, eval_metric='acc',
            epoch_end_callback=None, batch_end_callback=None, kvstore='local',
            optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
            eval_end_callback=None,
            eval_batch_end_callback=None, initializer=Uniform(0.01),
            arg_params=None, aux_params=None, allow_missing=False,
            force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
            validation_metric=None, monitor=None, prefix=None, state=None):
        """Train the module parameters.

        Parameters
        ----------
        train_data : DataIter
        eval_data : DataIter
            If not `None`, will be used as validation set and evaluate the performance
            after each epoch.
        eval_metric : str or EvalMetric
            Default `'acc'`. The performance measure used to display during training.
        epoch_end_callback : function or list of function
            Each callback will be called with the current `epoch`, `symbol`, `arg_params`
            and `aux_params`.
        batch_end_callback : function or list of function
            Each callback will be called with a `BatchEndParam`.
        kvstore : str or KVStore
            Default `'local'`.
        optimizer : str or Optimizer
            Default `'sgd'`
        optimizer_params : dict
            Default `(('learning_rate', 0.01),)`. The parameters for the optimizer constructor.
            The default value is not a `dict`, just to avoid pylint warning on dangerous
            default values.
        eval_end_callback : function or list of function
            These will be called at the end of each full evaluation, with the metrics over
            the entire evaluation set.
        eval_batch_end_callback : function or list of function
            These will be called at the end of each minibatch during evaluation
        initializer : Initializer
            Will be called to initialize the module parameters if not already initialized.
        arg_params : dict
            Default `None`, if not `None`, should be existing parameters from a trained
            model or loaded from a checkpoint (previously saved model). In this case,
            the value here will be used to initialize the module parameters, unless they
            are already initialized by the user via a call to `init_params` or `fit`.
            `arg_params` has higher priority to `initializer`.
        aux_params : dict
            Default `None`. Similar to `arg_params`, except for auxiliary states.
        allow_missing : bool
            Default `False`. Indicate whether we allow missing parameters when `arg_params`
            and `aux_params` are not `None`. If this is `True`, then the missing parameters
            will be initialized via the `initializer`.
        force_rebind : bool
            Default `False`. Whether to force rebinding the executors if already bound.
        force_init : bool
            Default `False`. Indicate whether we should force initialization even if the
            parameters are already initialized.
        begin_epoch : int
            Default `0`. Indicate the starting epoch. Usually, if we are resuming from a
            checkpoint saved at a previous training phase at epoch N, then we should specify
            this value as N+1.
        num_epoch : int
            Number of epochs to run training.

        Examples
        --------
        An example of using fit for training::
            >>> #Assume training dataIter and validation dataIter are ready
            >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter,
                        optimizer_params={'learning_rate':0.01, 'momentum': 0.9},
                        num_epoch=10)
        """
        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
                  for_training=True, force_rebind=force_rebind)
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                         allow_missing=allow_missing, force_init=force_init)
        self.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                            optimizer_params=optimizer_params)
        if state is not None:
            self._curr_module.load_optimizer_states(state)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            for nbatch, data_batch in enumerate(train_data):
                if monitor is not None:
                    monitor.tic()
                self.forward_backward(data_batch)
                self.update()
                self.update_metric(eval_metric, data_batch.label)

                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc-tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            #----------------------------------------
            # evaluation on validation set
            if eval_data:
                res = self.score(eval_data, validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback, epoch=epoch)
                #TODO: pull this into default
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
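
The state argument above restores optimizer states saved by an earlier run. A sketch of the matching save/load pair on a bound Module; the toy symbol, shapes, and file name are illustrative:

import mxnet as mx

sym = mx.sym.SoftmaxOutput(
    mx.sym.FullyConnected(mx.sym.Variable('data'), num_hidden=2),
    name='softmax')
mod = mx.mod.Module(sym, context=mx.cpu())
mod.bind(data_shapes=[('data', (4, 8))],
         label_shapes=[('softmax_label', (4,))], for_training=True)
mod.init_params()
mod.init_optimizer(optimizer='sgd')

mod.save_optimizer_states('demo.states')  # written alongside param checkpoints
mod.load_optimizer_states('demo.states')  # what load_optimizer_states(state) restores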
Ejemplo n.º 28
0
    path_imgrec = "./segmentation_data/%s_person_datasets/%s_%s_%d,%d.rec" % (
        data_type[0], data_type[0], data_type[1], resize_size[0],
        resize_size[1])
    logging.info('Dataset: %s' % path_imgrec)
    idx2imgname = {}
    with open(path_imglst, 'r') as fin:
        idx2imgname = {
            int(line.split('\t')[0]): line.split('\t')[2].strip()
            for line in fin.readlines()
        }
    logging.info('Number of images: %d' % len(idx2imgname))
    # endregion

    eval_metric = 'meanIOU'
    if eval_metric == 'acc':
        eval_metric = metric.create(eval_metric)
    elif eval_metric == 'meanIOU':
        # eval_metric = MeanIoU(c=15, threshold=args.threshold, num_class=21)
        eval_metric = MeanIoU(c=1, threshold=args.threshold, num_class=2)

    visual = args.visual

    if visual:
        result_path = './result/%s_person_datasets/result_%s_epoch%d_%s' % (
            data_type[0], pre_train_model_type, epoch, data_type[1])
        if not os.path.exists(result_path):
            os.makedirs(result_path)
        logging.info('Saving results to: %s' % result_path)
        # generate a color and a mask
        rgb = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0]
        # print(rgb)
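
MeanIoU above follows the standard custom-metric pattern: subclass mx.metric.EvalMetric and accumulate into sum_metric/num_inst in update. A toy pixel-accuracy variant, with PixelAcc as an illustrative name:

import numpy as np
import mxnet as mx

class PixelAcc(mx.metric.EvalMetric):
    def __init__(self, name='pixel_acc'):
        super(PixelAcc, self).__init__(name)

    def update(self, labels, preds):
        for label, pred in zip(labels, preds):
            # pred: (batch, num_class, pixels) -> hard class per pixel
            pred_cls = np.argmax(pred.asnumpy(), axis=1)
            label = label.asnumpy().astype('int32')
            self.sum_metric += (pred_cls == label).sum()
            self.num_inst += label.size

m = PixelAcc()
m.update([mx.nd.array([[0, 1]])], [mx.nd.array([[[0.9, 0.2], [0.1, 0.8]]])])
print(m.get())  # ('pixel_acc', 1.0)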
Ejemplo n.º 29
0
    def fit(self,
            X,
            marks,
            e_marks=None,
            y=None,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            time_step_callback=None,
            kvstore='local',
            logger=None,
            work_load_list=None,
            monitor=None,
            eval_batch_end_callback=None):
        """Overwrite"""

        data = self._init_iter(X, y, is_train=True)
        eval_data = self._init_eval_iter(eval_data)

        if self.sym_gen:
            self.symbol = self.sym_gen(data.default_bucket_key)  # pylint: disable=no-member
            self._check_arguments()
        self.kwargs["sym"] = self.symbol

        param_dict = dict(data.provide_data + data.provide_label)
        arg_names, param_names, aux_names = self._init_params(param_dict)

        # setup metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        # create kvstore
        (kvstore, update_on_kvstore) = _create_kvstore(kvstore, len(self.ctx),
                                                       self.arg_params)

        param_idx2name = {}
        if update_on_kvstore:
            param_idx2name.update(enumerate(param_names))
        else:
            for i, n in enumerate(param_names):
                for k in range(len(self.ctx)):
                    param_idx2name[i * len(self.ctx) + k] = n
        self.kwargs["param_idx2name"] = param_idx2name

        # init optimizer
        if isinstance(self.optimizer, str):
            batch_size = data.batch_size
            if kvstore and kvstore.type == 'dist_sync':
                batch_size *= kvstore.num_workers
            optimizer = opt.create(self.optimizer,
                                   rescale_grad=(1.0 / batch_size),
                                   **(self.kwargs))
        elif isinstance(self.optimizer, opt.Optimizer):
            optimizer = self.optimizer

        # do training
        _train_rnn(self.symbol,
                   self.ctx,
                   marks,
                   arg_names,
                   param_names,
                   aux_names,
                   self.arg_params,
                   self.aux_params,
                   begin_epoch=self.begin_epoch,
                   end_epoch=self.num_epoch,
                   epoch_size=self.epoch_size,
                   optimizer=optimizer,
                   train_data=data,
                   eval_data=eval_data,
                   eval_metric=eval_metric,
                   epoch_end_callback=epoch_end_callback,
                   batch_end_callback=batch_end_callback,
                   time_step_callback=time_step_callback,
                   kvstore=kvstore,
                   update_on_kvstore=update_on_kvstore,
                   logger=logger,
                   work_load_list=work_load_list,
                   monitor=monitor,
                   eval_batch_end_callback=eval_batch_end_callback,
                   sym_gen=self.sym_gen,
                   e_marks=e_marks)
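
The optimizer setup inside this fit scales rescale_grad by the number of workers under dist_sync; the same rule in isolation (a local kvstore here, so num_workers is 1):

import mxnet as mx
from mxnet import optimizer as opt

kv = mx.kvstore.create('local')
batch_size = 32
if kv and kv.type == 'dist_sync':
    batch_size *= kv.num_workers  # the effective batch spans all workers
optimizer = opt.create('sgd', learning_rate=0.01,
                       rescale_grad=1.0 / batch_size)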
Ejemplo n.º 30
0
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(('learning_rate', 0.01), ),
            eval_end_callback=None,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            best_model_callbacks=None,
            eval_interval=None,
            validation_metric=None,
            monitor=None):

        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label,
                  for_training=True,
                  force_rebind=force_rebind)

        if monitor is not None:
            self.install_monitor(monitor)

        self.init_params(initializer=initializer,
                         arg_params=arg_params,
                         aux_params=aux_params,
                         allow_missing=allow_missing,
                         force_init=force_init)

        self.init_optimizer(kvstore=kvstore,
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        if validation_metric is None:
            validation_metric = copy.deepcopy(eval_metric)

        epoch_metric = copy.deepcopy(eval_metric)

        swa_arg_params = None
        swa_aux_params = None
        swa_cnt = 0

        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic_epoch = time.time()
            eval_metric.reset()

            nbatch = 0
            end_of_batch = False
            data_iter = iter(train_data)
            next_data_batch = next(data_iter)
            name_values = []

            while not end_of_batch:
                data_batch = next_data_batch

                if monitor is not None:
                    monitor.tic()

                self.forward_backward(data_batch)
                self.update()

                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch)
                except StopIteration:
                    end_of_batch = True

                self.update_metric(eval_metric, data_batch.label)
                if end_of_batch:
                    name_values = eval_metric.get_name_value()

                if monitor is not None:
                    monitor.toc_print()

                nbatch += 1

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)

                    eval_metric.reset()

                # ----------------------------------------
                # mid-epoch evaluation on the validation set
                do_eval = eval_interval is not None and nbatch % eval_interval == 0
                if do_eval and eval_data:
                    res = self.score(
                        eval_data,
                        validation_metric,
                        score_end_callback=eval_end_callback,
                        batch_end_callback=eval_batch_end_callback,
                        epoch=epoch)
                    for name, val in res:
                        self.logger.info(
                            'Epoch[%d] Batch[%d] Validation-%s=%f', epoch,
                            nbatch, name, val)

                    if best_model_callbacks is not None:
                        for callback in _as_list(best_model_callbacks):
                            if callback.is_best(validation_metric):
                                # sync aux params across devices before saving
                                arg_params, aux_params = self.get_params()
                                callback.checkpoint_if_only_best(
                                    validation_metric, self.symbol, arg_params,
                                    aux_params)
                                break

            # one epoch of training is finished
            for name, val in name_values:
                self.logger.info('Epoch[%d] Train-%s=%f', epoch + 1, name, val)
            toc_epoch = time.time()
            elapsed = toc_epoch - tic_epoch
            avg_speed = float(len(train_data)) / elapsed
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch + 1, elapsed)
            self.logger.info('Epoch[%d] Average speed=%.3f samples/sec',
                             epoch + 1, avg_speed)

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            # evaluation on validation set
            if eval_data:
                res = self.score(eval_data,
                                 validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback,
                                 epoch=epoch + 1)
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch + 1,
                                     name, val)

                if best_model_callbacks is not None:
                    for callback in _as_list(best_model_callbacks):
                        callback.checkpoint_if_only_best(
                            validation_metric, self.symbol, arg_params,
                            aux_params)

            # end of epoch, reset the data-iter for another epoch
            train_data.reset()
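
A minimal usage sketch for the variant above, assuming `net` is a ready Symbol and `train_iter`/`val_iter` are ready DataIters; `eval_interval` and `best_model_callbacks` are the extras this variant adds, and the values shown are placeholders:

    import mxnet as mx

    # assumed: `net` is a Symbol, `train_iter`/`val_iter` are DataIters
    mod = mx.mod.Module(symbol=net, context=mx.cpu())
    mod.fit(train_data=train_iter,
            eval_data=val_iter,
            optimizer='sgd',
            optimizer_params={'learning_rate': 0.01, 'momentum': 0.9},
            eval_interval=500,           # also score on val_iter every 500 batches
            best_model_callbacks=None,   # or a list of checkpoint-if-best callbacks
            num_epoch=10)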
Example #31
    def fit(self, train_data, eval_data=None, eval_metric='acc',
            epoch_end_callback=None, batch_end_callback=None, kvstore='local',
            optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
            eval_end_callback=None,
            eval_batch_end_callback=None, initializer=Uniform(0.01),
            arg_params=None, aux_params=None, allow_missing=False,
            force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
            validation_metric=None, monitor=None, prefix=None,
            batches_checkpoint=None, num_batches_save_ckpt=2000):
        """Train the module parameters.

        Parameters
        ----------
        train_data : DataIter
        eval_data : DataIter
            If not `None`, it will be used as the validation set, and performance
            will be evaluated on it after each epoch.
        eval_metric : str or EvalMetric
            Default `'acc'`. The performance measure used to display during training.
        epoch_end_callback : function or list of function
            Each callback will be called with the current `epoch`, `symbol`, `arg_params`
            and `aux_params`.
        batch_end_callback : function or list of function
            Each callback will be called with a `BatchEndParam`.
        kvstore : str or KVStore
            Default `'local'`.
        optimizer : str or Optimizer
            Default `'sgd'`
        optimizer_params : dict
            Default `(('learning_rate', 0.01),)`. The parameters for the optimizer constructor.
            The default value is not a `dict`, just to avoid pylint warning on dangerous
            default values.
        eval_end_callback : function or list of function
            These will be called at the end of each full evaluation, with the metrics over
            the entire evaluation set.
        eval_batch_end_callback : function or list of function
            These will be called at the end of each minibatch during evaluation.
        initializer : Initializer
            Will be called to initialize the module parameters if not already initialized.
        arg_params : dict
            Default `None`. If not `None`, should be existing parameters from a trained
            model or loaded from a checkpoint (previously saved model). In this case,
            the value here will be used to initialize the module parameters, unless they
            are already initialized by the user via a call to `init_params` or `fit`.
            `arg_params` has higher priority than `initializer`.
        aux_params : dict
            Default `None`. Similar to `arg_params`, except for auxiliary states.
        allow_missing : bool
            Default `False`. Indicates whether to allow missing parameters when `arg_params`
            and `aux_params` are not `None`. If this is `True`, then the missing parameters
            will be initialized via the `initializer`.
        force_rebind : bool
            Default `False`. Whether to force rebinding the executors if already bound.
        force_init : bool
            Default `False`. Indicates whether to force initialization even if the
            parameters are already initialized.
        begin_epoch : int
            Default `0`. Indicates the starting epoch. Usually, if resuming from a
            checkpoint saved at a previous training phase at epoch N, this value should
            be specified as N+1.
        num_epoch : int
            Number of epochs to run training.

        Examples
        --------
        An example of using fit for training::

            >>> # Assume training dataIter and validation dataIter are ready
            >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter,
            ...         optimizer_params={'learning_rate': 0.01, 'momentum': 0.9},
            ...         num_epoch=10)
        """
        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
                  for_training=True, force_rebind=force_rebind)
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                         allow_missing=allow_missing, force_init=force_init)
        self.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            for nbatch, data_batch in enumerate(train_data):
                if monitor is not None:
                    monitor.tic()
                self.forward_backward(data_batch)
                self.update()
                self.update_metric(eval_metric, data_batch.label)
                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)

                if batches_checkpoint is not None and nbatch != 0 and nbatch % num_batches_save_ckpt == 0:
                    # mid-epoch checkpoint: sync the current params and reuse the
                    # epoch_end_callback(s) every num_batches_save_ckpt batches
                    arg_params, aux_params = self.get_params()
                    for callback in _as_list(epoch_end_callback):
                        callback(epoch, self.symbol, arg_params, aux_params)

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc-tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)
            if prefix is not None:
                self._curr_module.save_checkpoint(prefix, epoch + 1, save_optimizer_states=True)

            #----------------------------------------
            # evaluation on validation set
            if eval_data:
                res = self.score(eval_data, validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback, epoch=epoch)
                #TODO: pull this into default
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
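
A hedged usage sketch for the variant above, assuming `net` and the DataIters exist; `mx.callback.do_checkpoint` is the stock MXNet checkpoint callback, and the numbers are placeholders:

    import mxnet as mx

    mod = mx.mod.Module(symbol=net, context=mx.cpu())
    mod.fit(train_data=train_iter,
            eval_data=val_iter,
            epoch_end_callback=mx.callback.do_checkpoint('my_model'),
            batches_checkpoint=True,     # any non-None value enables mid-epoch saves
            num_batches_save_ckpt=2000,  # reuse epoch_end_callback every 2000 batches
            prefix='my_model',           # also save a checkpoint with optimizer state each epoch
            num_epoch=10)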
Example #32
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(('learning_rate', 0.01), ),
            eval_end_callback=None,
            iter_size=1,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            validation_metric=None,
            monitor=None):
        """Ke's revision: add iter_size. Trains the module parameters.

        Check out the `Module Tutorial <http://mxnet.io/tutorials/basic/module.html>`_ for
        an end-to-end use case.

        Parameters
        ----------
        train_data : DataIter
            Train DataIter.
        eval_data : DataIter
            If not ``None``, it will be used as the validation set and performance
            will be evaluated on it after each epoch.
        eval_metric : str or EvalMetric
            Defaults to ``'acc'`` (accuracy). The performance measure used to display during training.
            Other possible predefined metrics are:
            'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'.
        epoch_end_callback : function or list of functions
            Each callback will be called with the current `epoch`, `symbol`, `arg_params`
            and `aux_params`.
        batch_end_callback : function or list of function
            Each callback will be called with a `BatchEndParam`.
        kvstore : str or KVStore
            Defaults to 'local'.
        optimizer : str or Optimizer
            Defaults to 'sgd'.
        optimizer_params : dict
            Defaults to ``(('learning_rate', 0.01),)``. The parameters for
            the optimizer constructor.
            The default value is not a dict, just to avoid pylint warning on dangerous
            default values.
        eval_end_callback : function or list of function
            These will be called at the end of each full evaluation, with the metrics over
            the entire evaluation set.
        eval_batch_end_callback : function or list of function
            These will be called at the end of each mini-batch during evaluation.
        initializer : Initializer
            The initializer is called to initialize the module parameters when they are
            not already initialized.
        arg_params : dict
            Defaults to ``None``. If not ``None``, should be existing parameters from a trained
            model or loaded from a checkpoint (previously saved model). In this case,
            the value here will be used to initialize the module parameters, unless they
            are already initialized by the user via a call to `init_params` or `fit`.
            `arg_params` has a higher priority than `initializer`.
        aux_params : dict
            Defaults to ``None``. Similar to `arg_params`, except for auxiliary states.
        allow_missing : bool
            Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params`
            and `aux_params` are not ``None``. If this is ``True``, then the missing parameters
            will be initialized via the `initializer`.
        force_rebind : bool
            Defaults to ``False``. Whether to force rebinding the executors if already bound.
        force_init : bool
            Defaults to ``False``. Indicates whether to force initialization even if the
            parameters are already initialized.
        begin_epoch : int
            Defaults to 0. Indicates the starting epoch. Usually, if resumed from a
            checkpoint saved at a previous training phase at epoch N, then this value should be
            N+1.
        num_epoch : int
            Number of epochs for training.

        Examples
        --------
        >>> # An example of using fit for training.
        >>> # Assume training dataIter and validation dataIter are ready
        >>> # Assume loading a previously checkpointed model
        >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3)
        >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer='sgd',
        ...     optimizer_params={'learning_rate':0.01, 'momentum': 0.9},
        ...     arg_params=arg_params, aux_params=aux_params,
        ...     eval_metric='acc', num_epoch=10, begin_epoch=3)
        """
        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label,
                  for_training=True,
                  force_rebind=force_rebind,
                  grad_req='add')
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer,
                         arg_params=arg_params,
                         aux_params=aux_params,
                         allow_missing=allow_missing,
                         force_init=force_init)
        self.init_optimizer(kvstore=kvstore,
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            nbatch = 0
            data_iter = iter(train_data)
            end_of_batch = False
            next_data_batch = next(data_iter)

            while not end_of_batch:
                data_batch = next_data_batch
                if monitor is not None:
                    monitor.tic()
                # forward/backward are split so gradients accumulate across
                # batches; grad_req='add' was set when binding above
                self.forward(data_batch, is_train=True)
                self.backward()
                if nbatch % iter_size == 0:  # update once every iter_size batches
                    self.update()
                    # zero the accumulated gradients for the next group of batches
                    for g in self._curr_module._exec_group.grad_arrays:
                        for g1 in g:
                            if g1 is not None:
                                g1[:] = 0.

                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch)
                except StopIteration:
                    end_of_batch = True

                self.update_metric(eval_metric, data_batch.label)

                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                nbatch += 1

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            #----------------------------------------
            # evaluation on validation set
            if eval_data:
                res = self.score(eval_data,
                                 validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback,
                                 epoch=epoch)
                #TODO: pull this into default
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                     val)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
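
With `grad_req='add'`, gradients are summed over `iter_size` batches before each update, so the effective batch size becomes `batch_size * iter_size`. A sketch under that assumption (`net` and `train_iter` are placeholders); dividing `rescale_grad` by the effective batch size to keep the step size comparable is an assumption about intended usage, not something this code enforces:

    import mxnet as mx

    batch_size, iter_size = 32, 4
    mod = mx.mod.Module(symbol=net, context=mx.gpu(0))
    mod.fit(train_data=train_iter,
            optimizer='sgd',
            # gradients accumulate over iter_size batches before each update
            optimizer_params={'learning_rate': 0.01,
                              'rescale_grad': 1.0 / (batch_size * iter_size)},
            iter_size=iter_size,
            num_epoch=10)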