Example #1
def test(ctx=mx.cpu()):
    from mxboard import SummaryWriter
    sw = SummaryWriter(logdir='sphere_dynamic', flush_secs=5)

    net = nn.Sequential()
    b1 = base_net(48,
                  3,
                  fun=special_conv,
                  kernel_size=(3, 3),
                  same_shape=False)
    b2 = base_net(1,
                  48,
                  fun=special_conv,
                  kernel_size=(3, 3),
                  same_shape=False)
    fc = nn.Dense(3, in_units=9)
    net.add(b1, b2, fc)
    init_s(net, ctx)

    from mxnet import gluon, autograd
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.01})
    for i in range(10000):
        with autograd.record():
            out = net(img)
            loss = nd.sum(nd.abs(out - target))
        loss.backward()
        trainer.step(2)
        sw.add_scalar(tag='loss', value=loss.asscalar(), global_step=i)
        if i % 100 == 0:
            print(i, loss.asscalar())
    sw.close()
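
The helpers used above (base_net, special_conv, init_s) and the free variables img and target come from the original project and are not shown here. As a minimal, self-contained sketch of the same per-step add_scalar logging pattern, assuming only mxnet and mxboard are installed (the tiny network, log directory, and dummy tensors are illustrative):

import mxnet as mx
from mxnet import nd, autograd, gluon
from mxnet.gluon import nn
from mxboard import SummaryWriter

ctx = mx.cpu()
net = nn.Sequential()
net.add(nn.Dense(16, activation='relu'), nn.Dense(3))
net.initialize(mx.init.Xavier(), ctx=ctx)

img = nd.random.uniform(shape=(4, 9), ctx=ctx)      # dummy input batch
target = nd.random.uniform(shape=(4, 3), ctx=ctx)   # dummy regression target

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})
sw = SummaryWriter(logdir='sphere_dynamic_sketch', flush_secs=5)
for i in range(200):
    with autograd.record():
        out = net(img)
        loss = nd.sum(nd.abs(out - target))
    loss.backward()
    trainer.step(img.shape[0])
    # log one scalar point per optimization step
    sw.add_scalar(tag='loss', value=loss.asscalar(), global_step=i)
sw.close()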
Example #2
def test_add_scalar():
    sw = SummaryWriter(logdir=_LOGDIR)
    sw.add_scalar(tag='test_add_scalar',
                  value=np.random.uniform(),
                  global_step=0)
    sw.close()
    check_event_file_and_remove_logdir()
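
check_event_file_and_remove_logdir() is a helper from the surrounding test suite and is not shown here. An illustrative stand-in with the same intent might look like the sketch below; the default logdir and the events.out.tfevents file prefix are assumptions, not taken from the source:

import os
import shutil

def check_event_file_and_remove_logdir_sketch(logdir='./logs', prefix='events.out.tfevents'):
    """Illustrative stand-in: assert that an event file was written, then clean up."""
    files = os.listdir(logdir)
    assert any(name.startswith(prefix) for name in files), 'no event file found'
    shutil.rmtree(logdir)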
Example #3
class CallBackLogging(object):
    def __init__(self, rank, size, prefix_dir):
        self.batch_size = config.batch_size
        self.rank = rank
        self.size = size
        self.prefix_dir = prefix_dir
        self.frequent = config.frequent
        self.init = False
        self.tic = 0
        self.last_count = 0
        self.loss_metric = MetricNdarray()
        t = time.localtime()

        self.summary_writer = SummaryWriter(
            logdir=os.path.join(
                self.prefix_dir, "log_tensorboard",
                "%s_%s_%s" % (str(t.tm_mon), str(t.tm_mday), str(t.tm_hour))),
            verbose=False)

    def __call__(self, param):
        """Callback to show training speed and log loss/speed to TensorBoard."""
        count = param.num_update

        if self.last_count > count:
            self.init = False
        self.last_count = count

        self.loss_metric.update(param.loss[0])

        if self.init:
            if count % self.frequent == 0:
                nd.waitall()
                try:
                    speed = self.frequent * self.batch_size / (time.time() - self.tic)
                    speed_total = speed * self.size
                except ZeroDivisionError:
                    speed = float('inf')
                    speed_total = float('inf')

                # summary loss
                loss_scalar = self.loss_metric.get()
                self.summary_writer.add_scalar(tag="loss",
                    value=loss_scalar, global_step=param.num_update)
                loss_str_format = "[%d][%s]:%.2f " % (param.num_epoch, "loss", loss_scalar)
                self.loss_metric.reset()
                # summary speed
                self.summary_writer.add_scalar(
                    tag="speed",
                    value=speed, global_step=param.num_update)
                self.summary_writer.flush()
                if self.rank == 0:
                    logging.info(
                        "Iter:%d Rank:%.2f it/sec Total:%.2f it/sec %s",
                        param.num_update, speed, speed_total, loss_str_format)
                self.tic = time.time()
        else:
            self.init = True
            self.tic = time.time()
Example #4
def train(transformer, data_iter, lr, num_epochs, vocab, ctx):
    print('start training')
    print('ctx:', ctx)

    trainer = gluon.Trainer(transformer.collect_params(), 'adam', {'learning_rate': lr})
    loss = gloss.SoftmaxCrossEntropyLoss()

    best_epoch = 0
    best_loss = float('Inf')
    sw = SummaryWriter(logdir='../logs', flush_secs=5)

    for epoch in range(num_epochs):
        l_sum = 0.0
        for i, data in enumerate(data_iter):
            X, Y, label, X_valid_len, Y_valid_len = data
            # X = X.as_in_context(ctx)
            # Y = Y.as_in_context(ctx)
            # label = label.as_in_context(ctx)
            gpu_Xs = gutils.split_and_load(X, ctx, even_split=False)
            gpu_Ys = gutils.split_and_load(Y, ctx, even_split=False)
            gpu_labels = gutils.split_and_load(label, ctx, even_split=False)

            with autograd.record():
                # l = batch_loss(transformer, X, Y, vocab, loss)
                ls = [batch_loss(transformer, gpu_X, gpu_Y, gpu_label, vocab, loss)
                      for gpu_X, gpu_Y, gpu_label in zip(gpu_Xs, gpu_Ys, gpu_labels)]

            # l.backward()
            b_loss = 0.0
            for l in ls:
                l.backward()
                b_loss += l.asscalar()
            trainer.step(X.shape[0])
            nd.waitall()

            l_sum += b_loss
            if i % 100 == 0:
                info = "epoch %d, batch %d, batch_loss %.3f" % (epoch, i, b_loss)
                print(info)
                sw.add_scalar(tag='batch_loss', value=b_loss, global_step=i)

        cur_loss = l_sum / len(data_iter)
        # save the model
        if cur_loss < best_loss:
            best_loss = cur_loss
            best_epoch = epoch
            if not os.path.exists('../model'):
                os.mkdir('../model')
            transformer.save_parameters('../model/transformer' + str(epoch) + '.params')

        info = "epoch %d, loss %.3f, best_loss %.3f, best_epoch %d" % (
            epoch, cur_loss, best_loss, best_epoch)
        print(info)
        sw.add_scalar(tag='loss', value=cur_loss, global_step=epoch)
Example #5
File: rl.py Project: tsuberim/RL
class TensorboardStatsWriter(StatsWriter):
    def __init__(self, save_path=None, **kwargs):
        if not save_path:
            raise ValueError('save_path not specified')
        from mxboard import SummaryWriter
        logdir = save_path
        save_path = os.path.join(logdir, 'save')
        os.makedirs(logdir, exist_ok=True)
        self.sw = SummaryWriter(logdir=logdir)
        StatsWriter.__init__(self, save_path=save_path, **kwargs)

    def _write(self, idx, key, value):
        self.sw.add_scalar(key, value, idx)
Example #6
class TensorboardCallback(object):
    """Log metrics periodically in TensorBoard.
    This callback works almost the same as `callback.Speedometer`,
    but writes a TensorBoard event file for visualization.
    For more usage, please refer to https://github.com/dmlc/tensorboard

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory.
        Afterwards, use `tensorboard --logdir=path/to/logs` to launch
        TensorBoard visualization.
    prefix : str
        Prefix for a metric name of `scalar` value.
        You might want to use this param to leverage the TensorBoard plot
        feature, where TensorBoard plots different curves in one graph when
        they have the same `name`.
        See the usage sketch after this class for how to compare a train and
        an eval metric in the same graph.
    """
    def __init__(self, logging_dir, total_step=0, prefix=None):
        self.prefix = prefix
        self.step = total_step
        try:
            from mxboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error(
                'You can install mxboard via `pip install mxboard`.')

    def __call__(self, param, name_value=None):
        """Callback to log training speed and metrics in TensorBoard."""
        if name_value:
            self._add_scalar(name_value)

        if param.eval_metric is None:
            return

        # if param.add_step:
        self.step += 1

        name_value = param.eval_metric.get_name_value()
        if name_value:
            self._add_scalar(name_value)

    def _add_scalar(self, name_value):
        for name, value in name_value:
            if self.prefix is not None:
                name = '%s-%s' % (self.prefix, name)
            self.summary_writer.add_scalar(tag=name,
                                           value=value,
                                           global_step=self.step)
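
As referenced in the docstring, a hedged usage sketch: the module, data iterators, and log directories below are placeholders, not from the source. Giving the training and evaluation callbacks different directories and prefixes lets TensorBoard overlay the corresponding curves.

# log train and eval metrics under different directories with distinct prefixes
train_cb = TensorboardCallback(logging_dir='logs/train', prefix='train')
eval_cb = TensorboardCallback(logging_dir='logs/eval', prefix='eval')
# model.fit(train_iter,
#           eval_data=val_iter,
#           batch_end_callback=train_cb,
#           eval_end_callback=eval_cb,
#           num_epoch=10)
# Then launch `tensorboard --logdir=logs/` to compare the curves.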
Example #7
class LogMetricsCallback(object):
    """Log metrics periodically in TensorBoard.
    This callback works almost the same as `callback.Speedometer`, but writes a TensorBoard
    event file for visualization. For more usage, please refer to https://github.com/dmlc/tensorboard

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory.
        Afterwards, use `tensorboard --logdir=path/to/logs` to launch TensorBoard visualization.
    prefix : str
        Prefix for a metric name of `scalar` value.
        You might want to use this param to leverage the TensorBoard plot feature,
        where TensorBoard plots different curves in one graph when they have the same `name`.
        The following example shows how to compare a train and an eval metric in the same graph.

    Examples
    --------
    >>> # log train and eval metrics under different directories.
    >>> training_log = 'logs/train'
    >>> evaluation_log = 'logs/eval'
    >>> # in this case, each training and evaluation metric pair has the same name;
    >>> # you can add a prefix to keep them separate.
    >>> batch_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(training_log)]
    >>> eval_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(evaluation_log)]
    >>> # run
    >>> model.fit(train,
    >>>     ...
    >>>     batch_end_callback = batch_end_callbacks,
    >>>     eval_end_callback  = eval_end_callbacks)
    >>> # Then use `tensorboard --logdir=logs/` to launch TensorBoard visualization.
    """
    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        try:
            from mxboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error('You can install mxboard via `pip install mxboard`.')

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        if param.eval_metric is None:
            return
        name_value = param.eval_metric.get_name_value()
        for name, value in name_value:
            if self.prefix is not None:
                name = '%s-%s' % (self.prefix, name)
            self.summary_writer.add_scalar(name,
                                           value,
                                           global_step=param.epoch)
Example #8
def test_add_multiple_scalars():
    sw = SummaryWriter(logdir=_LOGDIR)
    sw.add_scalar(tag='test_multiple_scalars',
                  value=np.random.uniform(),
                  global_step=0)
    sw.add_scalar(tag='test_multiple_scalars',
                  value=('scalar1', np.random.uniform()),
                  global_step=0)
    sw.add_scalar(tag='test_multiple_scalars',
                  value=['scalar2', np.random.uniform()],
                  global_step=0)
    sw.add_scalar(tag='test_multiple_scalars',
                  value={
                      'scalar3': np.random.uniform(),
                      'scalar4': np.random.uniform()
                  },
                  global_step=0)
    items = os.listdir(_LOGDIR)
    assert len(items) == 2
    assert 'test_multiple_scalars' in items
    items.remove('test_multiple_scalars')
    assert items[0].startswith(_EVENT_FILE_PREFIX)
    print(items[0])
    assert file_exists(os.path.join(_LOGDIR, items[0]))

    named_scalar_dir = os.path.join(_LOGDIR, 'test_multiple_scalars')
    assert dir_exists(named_scalar_dir)
    for i in range(1, 5):
        sub_dir = os.path.join(named_scalar_dir, 'scalar%d' % i)
        assert dir_exists(sub_dir)
        sub_items = os.listdir(sub_dir)
        assert len(sub_items) == 1
        assert sub_items[0].startswith(_EVENT_FILE_PREFIX)
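
The directory layout asserted above comes from mxboard fanning a tuple/list/dict value out into one sub-run per named scalar under the tag. A minimal usage sketch (the log directory name and values are illustrative): passing a dict logs one curve per key under the same tag, so TensorBoard overlays them in a single chart.

from mxboard import SummaryWriter

sw = SummaryWriter(logdir='./multi_scalar_demo')
for step in range(10):
    # each key becomes its own sub-run under the 'loss' tag
    sw.add_scalar(tag='loss',
                  value={'train': 1.0 / (step + 1), 'valid': 1.5 / (step + 1)},
                  global_step=step)
sw.close()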
Example #9
def main(lstm,train_data,train_label):
    batch_size = 30
    epochs = 2000
    moving_loss = 0.
    learning_rate = 0.1
    sw = SummaryWriter(logdir='./logs', flush_secs=5)
    global_step = 0
    for e in range(epochs):
        # if e == 0:
        #     net = lstm.lstm_rnn(input_sequence, h, c, temperature=temperature)
        #     sw.add_graph(lstm)
        if (e + 1) % 50 == 0:
            learning_rate = learning_rate / 2.0
        h = nd.zeros(shape=(batch_size, num_hidden))
        c = nd.zeros(shape=(batch_size, num_hidden))
        num_batches = 2
        for i in range(num_batches):
            data_one_hot = train_data[i]
            label_one_hot = train_label[i]
            data_one_hot.attach_grad()
            label_one_hot.attach_grad()
            with autograd.record():
                outputs, h, c = lstm.lstm_rnn(inputs=data_one_hot, h=h, c=c)
                loss = lstm.average_ce_loss(outputs, label_one_hot)
                sw.add_scalar(tag='cross_entropy', value=loss.mean().asscalar(), global_step=global_step)
                loss.backward()
                global_step = global_step + 1


            lstm.SGD(learning_rate)
            if learning_rate % 20 == 0:
                learning_rate = learning_rate * 0.1
            if ( i == 0 ) and (e == 0):
                moving_loss = nd.mean(loss).asscalar()
            else:
                moving_loss = .99*moving_loss + .01*nd.mean(loss).asscalar()

        sw.add_scalar(tag='Loss', value=moving_loss, global_step=e)
        print("Epoch %s. Loss: %s" % (e, moving_loss))
        print(sample("1 2 3 ", 10,h,c, temperature=.1))
        print(sample("This eBook is for the use of anyone ", 10,h,c, temperature=.1))
Example #10
class HistoryKeeper():
    def __init__(self, logdir, keys=['val_acc', 'val_loss']):
        if not isinstance(keys, (list, tuple)):
            raise ValueError("Keys should be a list or a tuple.")
        self.keys = keys

        self.sw = SummaryWriter(logdir=os.path.join(logdir, 'tb'))
        self.csv_path = os.path.join(logdir, 'history.csv')

        with open(self.csv_path, 'w') as f:
            f.write(";".join(keys) + "\n")

    # Return True to interrupt training
    def __call__(self, model, params):
        epoch = params['epoch']
        pars_ = []
        for key in self.keys:
            if key in params:
                self.sw.add_scalar(key, params[key], epoch)
                pars_.append(str(params[key]))

        with open(self.csv_path, 'a') as f:
            f.write(";".join(pars_) + "\n")
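
A hedged usage sketch of the keeper above; the metric values are dummies, and a real training loop would supply the model and a params dict once per epoch:

import os

os.makedirs('./run0', exist_ok=True)
keeper = HistoryKeeper(logdir='./run0', keys=['val_acc', 'val_loss'])
for epoch in range(3):
    params = {'epoch': epoch,
              'val_acc': 0.50 + 0.10 * epoch,   # dummy metrics
              'val_loss': 1.00 - 0.20 * epoch}
    keeper(model=None, params=params)           # model is not used by this keeper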
Example #11
def do_training(args, module, data_train, data_val, begin_epoch=0):
    from distutils.dir_util import mkpath
    from log_util import LogUtil

    log = LogUtil().getlogger()
    mkpath(os.path.dirname(get_checkpoint_path(args)))

    #seq_len = args.config.get('arch', 'max_t_count')
    batch_size = args.config.getint('common', 'batch_size')
    save_checkpoint_every_n_epoch = args.config.getint(
        'common', 'save_checkpoint_every_n_epoch')
    save_checkpoint_every_n_batch = args.config.getint(
        'common', 'save_checkpoint_every_n_batch')
    enable_logging_train_metric = args.config.getboolean(
        'train', 'enable_logging_train_metric')
    enable_logging_validation_metric = args.config.getboolean(
        'train', 'enable_logging_validation_metric')

    contexts = parse_contexts(args)
    num_gpu = len(contexts)
    eval_metric = STTMetric(batch_size=batch_size,
                            num_gpu=num_gpu,
                            is_logging=enable_logging_validation_metric,
                            is_epoch_end=True)
    # mxboard setting
    loss_metric = STTMetric(batch_size=batch_size,
                            num_gpu=num_gpu,
                            is_logging=enable_logging_train_metric,
                            is_epoch_end=False)

    optimizer = args.config.get('optimizer', 'optimizer')
    learning_rate = args.config.getfloat('train', 'learning_rate')
    learning_rate_annealing = args.config.getfloat('train',
                                                   'learning_rate_annealing')

    mode = args.config.get('common', 'mode')
    num_epoch = args.config.getint('train', 'num_epoch')
    clip_gradient = args.config.getfloat('optimizer', 'clip_gradient')
    weight_decay = args.config.getfloat('optimizer', 'weight_decay')
    save_optimizer_states = args.config.getboolean('train',
                                                   'save_optimizer_states')
    show_every = args.config.getint('train', 'show_every')
    optimizer_params_dictionary = json.loads(
        args.config.get('optimizer', 'optimizer_params_dictionary'))
    kvstore_option = args.config.get('common', 'kvstore_option')
    n_epoch = begin_epoch
    is_bucketing = args.config.getboolean('arch', 'is_bucketing')

    if clip_gradient == 0:
        clip_gradient = None
    if is_bucketing and mode == 'load':
        model_file = args.config.get('common', 'model_file')
        model_name = os.path.splitext(model_file)[0]
        model_num_epoch = int(model_name[-4:])

        model_path = 'checkpoints/' + str(model_name[:-5])
        symbol, data_names, label_names = module(1600)
        model = STTBucketingModule(
            sym_gen=module,
            default_bucket_key=data_train.default_bucket_key,
            context=contexts)
        data_train.reset()

        model.bind(data_shapes=data_train.provide_data,
                   label_shapes=data_train.provide_label,
                   for_training=True)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            model_path, model_num_epoch)
        model.set_params(arg_params, aux_params)
        module = model
    else:
        module.bind(data_shapes=data_train.provide_data,
                    label_shapes=data_train.provide_label,
                    for_training=True)

    if begin_epoch == 0 and mode == 'train':
        module.init_params(initializer=get_initializer(args))

    lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)

    def reset_optimizer(force_init=False):
        optimizer_params = {
            'lr_scheduler': lr_scheduler,
            'clip_gradient': clip_gradient,
            'wd': weight_decay
        }
        optimizer_params.update(optimizer_params_dictionary)
        module.init_optimizer(kvstore=kvstore_option,
                              optimizer=optimizer,
                              optimizer_params=optimizer_params,
                              force_init=force_init)

    if mode == "train":
        reset_optimizer(force_init=True)
    else:
        reset_optimizer(force_init=False)
        data_train.reset()
        data_train.is_first_epoch = True

    #mxboard setting
    mxlog_dir = args.config.get('common', 'mxboard_log_dir')
    summary_writer = SummaryWriter(mxlog_dir)

    while True:

        if n_epoch >= num_epoch:
            break
        loss_metric.reset()
        log.info('---------train---------')
        for nbatch, data_batch in enumerate(data_train):
            module.forward_backward(data_batch)
            module.update()
            # mxboard setting
            if (nbatch + 1) % show_every == 0:
                module.update_metric(loss_metric, data_batch.label)
            #summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch)
            if (nbatch + 1) % save_checkpoint_every_n_batch == 0:
                log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch,
                         nbatch)
                module.save_checkpoint(
                    prefix=get_checkpoint_path(args) + "n_epoch" +
                    str(n_epoch) + "n_batch",
                    epoch=(int(
                        (nbatch + 1) / save_checkpoint_every_n_batch) - 1),
                    save_optimizer_states=save_optimizer_states)
        # commented for Libri_sample data set to see only train cer
        log.info('---------validation---------')
        data_val.reset()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(data_val):
            # batch norm gives a high CER when is_train=False, so keep is_train=True here
            module.forward(data_batch, is_train=True)
            module.update_metric(eval_metric, data_batch.label)

        # mxboard setting
        val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
        log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer,
                 int(val_n_label - val_l_dist), val_n_label)
        curr_acc = val_cer
        summary_writer.add_scalar('CER validation', val_cer, n_epoch)
        assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'

        data_train.reset()
        data_train.is_first_epoch = False

        # mxboard setting
        train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value(
        )
        summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
        summary_writer.add_scalar('CER train', train_cer, n_epoch)

        # save checkpoints
        if n_epoch % save_checkpoint_every_n_epoch == 0:
            log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch)
            module.save_checkpoint(prefix=get_checkpoint_path(args),
                                   epoch=n_epoch,
                                   save_optimizer_states=save_optimizer_states)

        n_epoch += 1

        lr_scheduler.learning_rate = learning_rate / learning_rate_annealing

    log.info('FINISH')
Example #12
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        offset_alloc_size=(64, 64),
        anchors={"shallow": [(10, 13), (16, 30), (33, 23)],
                 "middle": [(30, 61), (62, 45), (59, 119)],
                 "deep": [(116, 90), (156, 198), (373, 326)]},
        graphviz=False,
        epoch=100,
        input_size=[416, 416],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=False,
        factor_scale=[13, 5],
        ignore_threshold=0.5,
        dynamic=False,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=5,
        load_period=10,
        learning_rate=0.001, decay_lr=0.999, decay_step=10,
        GPU_COUNT=0,
        Darknetlayer=53,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # check the operating system
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    # force the input size to be a multiple of 32 - keeps the strides from being distorted
    if input_size[0] % 32 != 0 and input_size[1] % 32 != 0:
        logging.info("The input size must be a multiple of 32")
        exit(0)

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training YoloV3 Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        net = Yolov3(Darknetlayer=Darknetlayer,
                     anchors=anchors,
                     pretrained=False,
                     ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                          factor_scale=factor_scale,
                                                          augmentation=data_augmentation,
                                                          path=train_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=batch_size,
                                                          batch_interval=batch_interval,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold, dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True, mean=mean, std=std,
                                                          net=net, ignore_threshold=ignore_threshold, dynamic=dynamic,
                                                          from_sigmoid=False, make_target=True)

    except Exception:
        logging.info("dataset not found")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size is larger than the number of samples")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size is larger than the number of samples")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "Dark_" + str(Darknetlayer)
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_Dark_" + str(Darknetlayer)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path,
                                        ['data'],
                                        param_path, ctx=ctx)
    else:
        start_epoch = 0
        '''
        Strategy for accepting arbitrary input images from MXNet C++.
        alloc_size : tuple of int, default is (128, 128)
        For advanced users. Define `alloc_size` to generate large enough offset
        maps, which will later be saved in parameters. During inference, we support arbitrary
        input images by cropping the corresponding area of the anchor map. This allows us
        to export to symbol so we can run it in c++, Scalar, etc.
        '''
        net = Yolov3(Darknetlayer=Darknetlayer,
                     input_size=input_size,
                     anchors=anchors,
                     num_classes=num_classes,  # foreground classes only
                     pretrained=pretrained_base,
                     pretrained_path=pretrained_path,
                     alloc_size=offset_alloc_size,
                     ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "beta1": 0.9,
                                                                                       "beta2": 0.999,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "gamma1": 0.9,
                                                                                       "gamma2": 0.999,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "wd": 0.0005,
                                                                                       "momentum": 0.9,
                                                                                       'multi_precision': False},
                                    update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "beta1": 0.9,
                                                                                       "beta2": 0.999,
                                                                                       'multi_precision': False})
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "gamma1": 0.9,
                                                                                       "gamma2": 0.999,
                                                                                       'multi_precision': False})
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate,
                                                                                       "lr_scheduler": lr_sch,
                                                                                       "wd": 0.0005,
                                                                                       "momentum": 0.9,
                                                                                       'multi_precision': False})

        else:
            logging.error("optimizer not selected")
            exit(0)

    loss = Yolov3Loss(sparse_label=True,
                      from_sigmoid=False,
                      batch_axis=None,
                      num_classes=num_classes,
                      reduction="sum",
                      exclude=False)

    prediction = Prediction(
        from_sigmoid=False,
        num_classes=num_classes,
        nms_thresh=nms_thresh,
        nms_topk=nms_topk,
        except_class_thresh=except_class_thresh,
        multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):

        xcyc_loss_sum = 0
        wh_loss_sum = 0
        object_loss_sum = 0
        class_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate(
                train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            xcyc_all = mx.nd.split(data=xcyc_all, num_outputs=subdivision, axis=0)
            wh_all = mx.nd.split(data=wh_all, num_outputs=subdivision, axis=0)
            objectness_all = mx.nd.split(data=objectness_all, num_outputs=subdivision, axis=0)
            class_all = mx.nd.split(data=class_all, num_outputs=subdivision, axis=0)
            weights_all = mx.nd.split(data=weights_all, num_outputs=subdivision, axis=0)

            if subdivision == 1:
                image = [image]
                xcyc_all = [xcyc_all]
                wh_all = [wh_all]
                objectness_all = [objectness_all]
                class_all = [class_all]
                weights_all = [weights_all]
            '''
            autograd explanation:
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                xcyc_all_losses = []
                wh_all_losses = []
                object_all_losses = []
                class_all_losses = []

                for image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split in zip(image,
                                                                                                           xcyc_all,
                                                                                                           wh_all,
                                                                                                           objectness_all,
                                                                                                           class_all,
                                                                                                           weights_all):

                    if GPU_COUNT <= 1:
                        image_split = gluon.utils.split_and_load(image_split, [ctx], even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split, [ctx], even_split=False)
                        wh_split = gluon.utils.split_and_load(wh_split, [ctx], even_split=False)
                        objectness_split = gluon.utils.split_and_load(objectness_split, [ctx], even_split=False)
                        class_split = gluon.utils.split_and_load(class_split, [ctx], even_split=False)
                        weights_split = gluon.utils.split_and_load(weights_split, [ctx], even_split=False)
                    else:
                        image_split = gluon.utils.split_and_load(image_split, ctx, even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split, ctx, even_split=False)
                        wh_split = gluon.utils.split_and_load(wh_split, ctx, even_split=False)
                        objectness_split = gluon.utils.split_and_load(objectness_split, ctx, even_split=False)
                        class_split = gluon.utils.split_and_load(class_split, ctx, even_split=False)
                        weights_split = gluon.utils.split_and_load(weights_split, ctx, even_split=False)

                    xcyc_losses = []
                    wh_losses = []
                    object_losses = []
                    class_losses = []
                    total_loss = []

                    # code that handles N GPUs (data parallelism)
                    for img, xcyc_target, wh_target, objectness, class_target, weights in zip(image_split, xcyc_split,
                                                                                              wh_split,
                                                                                              objectness_split,
                                                                                              class_split,
                                                                                              weights_split):
                        output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                            img)
                        xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                           wh_target, objectness,
                                                                           class_target, weights)
                        xcyc_losses.append(xcyc_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())
                        object_losses.append(object_loss.asscalar())
                        class_losses.append(class_loss.asscalar())
                        total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss)
                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    xcyc_all_losses.append(sum(xcyc_losses))
                    wh_all_losses.append(sum(wh_losses))
                    object_all_losses.append(sum(object_losses))
                    class_all_losses.append(sum(class_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # clear the accumulated gradients
            for p in net.collect_params().values():
                p.zero_grad()

            xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size
            object_loss_sum += sum(object_all_losses) / td_batch_size
            class_loss_sum += sum(class_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]'
                             f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]'
                             f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]'
                             f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch)
        train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean
        logging.info(
            f"train xcyc loss : {train_xcyc_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / "
            f"train object loss : {train_object_loss_mean} / "
            f"train class loss : {train_class_loss_mean} / "
            f"train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            xcyc_loss_sum = 0
            wh_loss_sum = 0
            object_loss_sum = 0
            class_loss_sum = 0

            # compute the validation loss
            for image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader:
                vd_batch_size, _, height, width = image.shape

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                    xcyc_all = gluon.utils.split_and_load(xcyc_all, [ctx], even_split=False)
                    wh_all = gluon.utils.split_and_load(wh_all, [ctx], even_split=False)
                    objectness_all = gluon.utils.split_and_load(objectness_all, [ctx], even_split=False)
                    class_all = gluon.utils.split_and_load(class_all, [ctx], even_split=False)
                    weights_all = gluon.utils.split_and_load(weights_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)
                    xcyc_all = gluon.utils.split_and_load(xcyc_all, ctx, even_split=False)
                    wh_all = gluon.utils.split_and_load(wh_all, ctx, even_split=False)
                    objectness_all = gluon.utils.split_and_load(objectness_all, ctx, even_split=False)
                    class_all = gluon.utils.split_and_load(class_all, ctx, even_split=False)
                    weights_all = gluon.utils.split_and_load(weights_all, ctx, even_split=False)

                xcyc_losses = []
                wh_losses = []
                object_losses = []
                class_losses = []
                total_loss = []

                # code that handles N GPUs (data parallelism)
                for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(image, label, xcyc_all,
                                                                                              wh_all, objectness_all,
                                                                                              class_all, weights_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]

                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    id, score, bbox = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2,
                                                 offset3, stride1, stride2, stride3)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target,
                                                                       wh_target, objectness,
                                                                       class_target, weights)
                    xcyc_losses.append(xcyc_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())
                    object_losses.append(object_loss.asscalar())
                    class_losses.append(class_loss.asscalar())
                    total_loss.append(xcyc_losses + wh_losses + object_losses + class_losses)

                xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size
                object_loss_sum += sum(object_losses) / vd_batch_size
                class_loss_sum += sum(class_losses) / vd_batch_size

            valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch)
            valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + valid_class_loss_mean

            logging.info(
                f"valid xcyc loss : {valid_xcyc_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / "
                f"valid object loss : {valid_object_loss_mean} / "
                f"valid class loss : {valid_class_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i)
            precision_recall.reset()

            if tensorboard:
                # code that handles N GPUs (data parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net(
                        img)
                    ids, scores, bboxes = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1,
                                                     offset2, offset3, stride1, stride2, stride3)

                    for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose(
                            (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # draw the ground truth boxes
                        ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None,
                                                 reverse_rgb=True,
                                                 class_names=valid_dataset.classes, absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # draw the prediction boxes
                        prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes, absolute_coordinates=True)

                        # convert BGR -> RGB and (height, width, channel) -> (channel, height, width) for TensorBoard
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i)

                summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean,
                                                         "valid_xcyc_loss": valid_xcyc_loss_mean}, global_step=i)
                summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean,
                                                         "valid_wh_loss": valid_wh_loss_mean}, global_step=i)
                summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean,
                                                             "valid_object_loss": valid_object_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean,
                                                            "valid_class_loss": valid_class_loss_mean}, global_step=i)

                summary.add_scalar(tag="total_loss", value={
                    "train_total_loss": train_total_loss_mean,
                    "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default')

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there is only one input, it will have the name data. When there are multiple inputs, they will be named data0, data1, etc.
            '''

            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)

            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)  # for onnx
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # for ONNX export
                # handles network inference, decoding and NMS - convenient from MXNet C++ / cannot be exported to ONNX.
                export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # at C++ inference time, images read by OpenCV can be fed in as-is
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export raised an exception: {E}")
            else:
                logging.info("json, param model export succeeded")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : approx. {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
Example #13
def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay):
    """Perform CapsNet training"""
    summary_writer = SummaryWriter(args.tblog_dir)
    lr_scheduler = SimpleLRScheduler(learning_rate)
    optimizer_params = {'lr_scheduler': lr_scheduler}
    module.init_params()
    module.init_optimizer(kvstore=kvstore,
                          optimizer=optimizer,
                          optimizer_params=optimizer_params)
    n_epoch = 0
    while True:
        if n_epoch >= num_epoch:
            break
        train_iter.reset()
        val_iter.reset()
        loss_metric.reset()
        for n_batch, data_batch in enumerate(train_iter):
            module.forward_backward(data_batch)
            module.update()
            module.update_metric(loss_metric, data_batch.label)
            loss_metric.get_batch_log(n_batch)
        train_acc, train_loss, train_recon_err = loss_metric.get_name_value()
        loss_metric.reset()
        for n_batch, data_batch in enumerate(val_iter):
            module.forward(data_batch)
            module.update_metric(loss_metric, data_batch.label)
            loss_metric.get_batch_log(n_batch)
        val_acc, val_loss, val_recon_err = loss_metric.get_name_value()

        summary_writer.add_scalar('train_acc', train_acc, n_epoch)
        summary_writer.add_scalar('train_loss', train_loss, n_epoch)
        summary_writer.add_scalar('train_recon_err', train_recon_err, n_epoch)
        summary_writer.add_scalar('val_acc', val_acc, n_epoch)
        summary_writer.add_scalar('val_loss', val_loss, n_epoch)
        summary_writer.add_scalar('val_recon_err', val_recon_err, n_epoch)

        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, train_acc, train_loss,
                                                                        train_recon_err))
        print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, val_acc, val_loss, val_recon_err))
        print('SAVE CHECKPOINT')

        module.save_checkpoint(prefix=model_prefix, epoch=n_epoch)
        n_epoch += 1
        lr_scheduler.learning_rate = learning_rate * (decay ** n_epoch)
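The per-epoch update lr_scheduler.learning_rate = learning_rate * (decay ** n_epoch) above is a plain exponential schedule. A minimal sketch of the equivalent built-in scheduler, assuming the decay should fire once per epoch and that the batch count per epoch is known (both values below are illustrative):

import mxnet as mx

learning_rate, decay, batches_per_epoch = 0.001, 0.99, 469  # illustrative values

# FactorScheduler multiplies the learning rate by `factor` every `step` updates,
# so with step == batches_per_epoch it reproduces lr = learning_rate * decay ** n_epoch
scheduler = mx.lr_scheduler.FactorScheduler(step=batches_per_epoch, factor=decay)
optimizer = mx.optimizer.Adam(learning_rate=learning_rate, lr_scheduler=scheduler)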
Ejemplo n.º 14
0
def train(epochs, ctx):
    # Collect all parameters from net and its children, then initialize them.
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    net.hybridize()

    # Trainer is for updating parameters with gradient.
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': opt.lr, 'momentum': opt.momentum})
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # collect parameter names for logging the gradients of parameters in each epoch
    params = net.collect_params()
    param_names = params.keys()

    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='./logs', flush_secs=5)

    global_step = 0
    for epoch in range(epochs):
        # reset data iterator and metric at beginning of epoch.
        metric.reset()
        for i, (data, label) in enumerate(train_data):
            # Copy data to ctx if necessary
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            # Start recording computation graph with record() section.
            # Recorded graphs can then be differentiated with backward.
            with autograd.record():
                output = net(data)
                L = loss(output, label)
            sw.add_scalar(tag='cross_entropy', value=L.mean().asscalar(), global_step=global_step)
            global_step += 1
            L.backward()

            # take a gradient step with batch_size equal to data.shape[0]
            trainer.step(data.shape[0])
            # update metric at last.
            metric.update([label], [output])

            if i % opt.log_interval == 0 and i > 0:
                name, train_acc = metric.get()
                print('[Epoch %d Batch %d] Training: %s=%f' % (epoch, i, name, train_acc))

            # Log the first batch of images of each epoch
            if i == 0:
                sw.add_image('minist_first_minibatch', data.reshape((opt.batch_size, 1, 28, 28)), epoch)

        if epoch == 0:
            sw.add_graph(net)

        grads = [i.grad() for i in net.collect_params().values()]
        assert len(grads) == len(param_names)
        # logging the gradients of parameters for checking convergence
        for i, name in enumerate(param_names):
            sw.add_histogram(tag=name, values=grads[i], global_step=epoch, bins=1000)

        name, train_acc = metric.get()
        print('[Epoch %d] Training: %s=%f' % (epoch, name, train_acc))
        # logging training accuracy
        sw.add_scalar(tag='accuracy_curves', value=('train_acc', train_acc), global_step=epoch)

        name, val_acc = test(ctx)
        print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc))
        # logging the validation accuracy
        sw.add_scalar(tag='accuracy_curves', value=('valid_acc', val_acc), global_step=epoch)

    sw.export_scalars('scalar_dict.json')
    sw.close()
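export_scalars dumps everything the writer accumulated into a plain JSON file, so the curves logged above can be inspected without TensorBoard. A short sketch, assuming each key in the file maps a logged tag to its list of recorded points:

import json

with open('scalar_dict.json') as f:
    scalars = json.load(f)

for key, points in scalars.items():
    print('%s: %d recorded values' % (key, len(points)))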
Ejemplo n.º 15
0
class ModuleLearner():
    def __init__(self,
                 model,
                 run_id,
                 gpu_idxs=None,
                 tensorboard_logging=False):
        """

        Parameters
        ----------
        model: HybridBlock
        gpu_idxs: None or list of ints
            If None will set context to CPU.
            If list of ints, will set context to given GPUs.
        """
        logging.info("Using Module Learner.")
        model.hybridize()
        logging.info("Hybridized model.")
        input = mx.sym.var('data')
        pre_output = model(input)
        output = mx.sym.SoftmaxOutput(pre_output, name='softmax')
        context = get_context(gpu_idxs)
        self.module = mx.mod.Module(symbol=output,
                                    context=context,
                                    data_names=['data'],
                                    label_names=['softmax_label'])
        self.tensorboard_logging = tensorboard_logging
        if self.tensorboard_logging:
            from mxboard import SummaryWriter
            current_folder = os.path.dirname(os.path.realpath(__file__))
            tensorboard_folder = os.path.join(current_folder, "..", "logs",
                                              "tensorboard")
            summary_filepath = os.path.join(tensorboard_folder, run_id)
            self.writer = SummaryWriter(logdir=summary_filepath)

    def fit(self,
            train_data,
            valid_data,
            epochs=300,
            lr=None,
            lr_schedule=None,
            initializer=mx.init.Xavier(),
            optimizer=None,
            kvstore='device',
            log_frequency=10000,
            early_stopping_criteria=None):
        """
        Uses accuracy as training and validation metric.

        Parameters
        ----------
        train_data : DataIter
            Contains training data
        valid_data : DataIter
            Contains validation data
        epochs: int
            Number of epochs to run, unless stopped early by early_stopping_criteria.
        lr: float or int
            Learning rate
        lr_schedule : dict
            Contains change points of learning rate.
            Key is the epoch and value is the learning rate.
            Must contain epoch 0.
        initializer : mxnet.initializer.Initializer
        optimizer: mxnet.optimizer.Optimizer
            Defaults to `mx.optimizer.SGD(learning_rate=lr_schedule[0], rescale_grad=1.0/batch_size, momentum=0.9)`
        kvstore : str, optional
        log_frequency : int, optional
            Number of batches between logs
        early_stopping_criteria: function (float -> boolean)
            Given validation accuracy, should return True if training should be stopped early.

        Returns
        -------

        None

        Output is logged to file.

        """

        if lr_schedule is None:
            assert lr is not None, "lr must be defined if not using lr_schedule"
            lr_schedule = {0: lr}
        else:
            assert lr is None, "lr should not be defined if using lr_schedule"
            assert 0 in lr_schedule.keys(
            ), "lr for epoch 0 must be defined in lr_schedule"

        mod = self.module
        batch_size = train_data.provide_data[0].shape[0]
        mod.bind(data_shapes=train_data.provide_data,
                 label_shapes=train_data.provide_label)
        mod.init_params(initializer=initializer)
        if optimizer is None:
            optimizer = mx.optimizer.SGD(learning_rate=lr_schedule[0],
                                         rescale_grad=1.0 / batch_size,
                                         momentum=0.9)
        mod.init_optimizer(kvstore=kvstore, optimizer=optimizer)
        train_metric = mx.metric.create('acc')
        validation_metric = mx.metric.create('acc')
        max_val_acc = {'val_acc': 0, 'trn_acc': 0, 'epoch': 0}

        for epoch in range(epochs):
            epoch_tick = time.time()

            # update learning rate
            if epoch in lr_schedule.keys():
                mod._optimizer.lr = lr_schedule[epoch]
                logging.info("Epoch {}, Changed learning rate.".format(epoch))
            logging.info('Epoch {}, Learning rate={}'.format(
                epoch, mod._optimizer.lr))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='learning_rate',
                                       value=mod._optimizer.lr,
                                       global_step=epoch + 1)

            train_data.reset()
            train_metric.reset()
            samples_processed = 0
            for batch_idx, batch in enumerate(train_data):
                batch_tick = time.time()
                mod.forward(batch, is_train=True)  # compute predictions
                mod.update_metric(
                    train_metric,
                    batch.label)  # accumulate prediction accuracy
                mod.backward()  # compute gradients
                mod.update()  # update parameters

                if self.tensorboard_logging:
                    # log to tensorboard (on first batch)
                    if batch_idx == 0:
                        self.writer.add_image(tag="batch",
                                              image=batch.data[0],
                                              global_step=epoch + 1)

                # log batch speed (if a multiple of log_frequency is contained in the last batch)
                log_batch = (samples_processed // log_frequency) != (
                    (samples_processed + batch_size) // log_frequency)
                if ((batch_idx >= 1) and log_batch):
                    # batch estimate, not averaged over multiple batches
                    speed = batch_size / (time.time() - batch_tick)
                    logging.info(
                        'Epoch {}, Batch {}, Speed={:.2f} images/second'.
                        format(epoch, batch_idx, speed))
                samples_processed += batch_size

            # log training accuracy
            _, trn_acc = train_metric.get()
            logging.info('Epoch {}, Training accuracy={}'.format(
                epoch, trn_acc))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='accuracy/training',
                                       value=trn_acc * 100,
                                       global_step=epoch + 1)

            # log validation accuracy
            res = mod.score(valid_data, validation_metric)
            _, val_acc = res[0]
            logging.info('Epoch {}, Validation accuracy={}'.format(
                epoch, val_acc))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='accuracy/validation',
                                       value=val_acc * 100,
                                       global_step=epoch + 1)
            # log maximum validation accuracy
            if val_acc > max_val_acc['val_acc']:
                max_val_acc = {
                    'val_acc': val_acc,
                    'trn_acc': trn_acc,
                    'epoch': epoch
                }
            logging.info(("Epoch {}, Max validation accuracy={} @ "
                          "Epoch {} (with training accuracy={})").format(
                              epoch, max_val_acc['val_acc'],
                              max_val_acc['epoch'], max_val_acc['trn_acc']))

            # log duration of epoch
            logging.info('Epoch {}, Duration={}'.format(
                epoch,
                time.time() - epoch_tick))

            if early_stopping_criteria:
                if early_stopping_criteria(val_acc):
                    logging.info(
                        "Epoch {}, Reached early stopping target, stopping training."
                        .format(epoch))
                    break

    def predict(self, test_data, log_frequency=10000):
        logging.info('Starting inference.')

        mod = self.module
        batch_size = test_data.provide_data[0].shape[0]
        mod.bind(data_shapes=test_data.provide_data,
                 label_shapes=test_data.provide_label)

        samples_processed = 0
        batch_tick = time.time()
        for pred, batch_idx, batch in mod.iter_predict(test_data):
            pred[0].wait_to_read()
            batch_tock = time.time()
            # log batch speed (if a multiple of log_frequency is contained in the last batch)
            log_batch = (samples_processed // log_frequency) != (
                (samples_processed + batch_size) // log_frequency)
            warm_up_period = 5
            if ((batch_idx >= warm_up_period) and log_batch):
                # batch estimate, not averaged over multiple batches
                latency = (batch_tock - batch_tick)  # seconds
                speed = batch_size / latency
                logging.info(
                    'Inference. Batch {}, Latency={:.5f} ms, Speed={:.2f} images/second'
                    .format(batch_idx, latency * 1000, speed))
            samples_processed += batch_size
            batch_tick = time.time()

        logging.info('Completed inference.')
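A hypothetical usage sketch of the learner above, showing the lr_schedule contract from the fit docstring (epoch 0 must be present) together with an early-stopping criterion; model, train_iter, val_iter and test_iter are assumed to be defined elsewhere in the project:

learner = ModuleLearner(model, run_id='resnet_run_01', gpu_idxs=[0],
                        tensorboard_logging=True)
learner.fit(train_data=train_iter,
            valid_data=val_iter,
            epochs=120,
            lr_schedule={0: 0.1, 60: 0.01, 90: 0.001},  # epoch 0 is required
            early_stopping_criteria=lambda val_acc: val_acc >= 0.94)
learner.predict(test_iter)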
Ejemplo n.º 16
0
    # train model
    global_step = 1
    for epoch in range(1, epochs + 1):
        
        for train_w, train_d, train_r, train_t in train_loader:
            
            start_time = time()
            
            with autograd.record():
                output = net([train_w, train_d, train_r])
                l = loss_function(output, train_t)
            l.backward()
            trainer.step(train_t.shape[0])
            training_loss = l.mean().asscalar()

            sw.add_scalar(tag = 'training_loss', value = training_loss, global_step = global_step)
            
            print('global step: %s, training loss: %.2f, time: %.2fs'\
                %(global_step, training_loss, time() - start_time))
            global_step += 1

        # logging the gradients of parameters for checking convergence
        for name, param in net.collect_params().items():
            try:
                sw.add_histogram(tag = name + "_grad", values = param.grad(), global_step = global_step, bins = 1000)
            except Exception:
                print(name)
                print(param.grad())

        # compute validation loss
        compute_val_loss(net, val_loader, loss_function, sw, epoch)
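compute_val_loss is called above but not defined in this excerpt. A minimal sketch of what such a helper could look like, assuming the validation loader yields the same 4-tuples as train_loader and that one scalar is logged per epoch:

def compute_val_loss(net, val_loader, loss_function, sw, epoch):
    # hypothetical helper: average validation loss, logged once per epoch
    total_loss, num_batches = 0.0, 0
    for val_w, val_d, val_r, val_t in val_loader:
        output = net([val_w, val_d, val_r])
        total_loss += loss_function(output, val_t).mean().asscalar()
        num_batches += 1
    val_loss = total_loss / max(num_batches, 1)
    sw.add_scalar(tag='validation_loss', value=val_loss, global_step=epoch)
    print('epoch: %s, validation loss: %.2f' % (epoch, val_loss))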
Ejemplo n.º 17
0
class FIFOScheduler(TaskScheduler):
    r"""Simple scheduler that just runs trials in submission order.

    Parameters
    ----------
    train_fn : callable
        A task launch function for training. Note: please add the `@autogluon_method` decorator to the original function.
    args : object (optional)
        Default arguments for launching train_fn.
    resource : dict
        Computation resources. For example, `{'num_cpus':2, 'num_gpus':1}`
    searcher : str or object
        Autogluon searcher. For example, autogluon.searcher.RandomSampling
    time_attr : str
        A training result attr to use for comparing time.
        Note that you can pass in something non-temporal such as
        `training_epoch` as a measure of progress, the only requirement
        is that the attribute should increase monotonically.
    reward_attr : str
        The training result objective value attribute. As with `time_attr`, this may refer to any objective value.
        Stopping procedures will use this attribute.
    dist_ip_addrs : list of str
        IP addresses of remote machines.

    Examples
    --------
    >>> import numpy as np
    >>> import autogluon as ag
    >>> @ag.args(
    ...     lr=ag.space.Real(1e-3, 1e-2, log=True),
    ...     wd=ag.space.Real(1e-3, 1e-2))
    >>> def train_fn(args, reporter):
    ...     print('lr: {}, wd: {}'.format(args.lr, args.wd))
    ...     for e in range(10):
    ...         dummy_accuracy = 1 - np.power(1.8, -np.random.uniform(e, 2*e))
    ...         reporter(epoch=e, accuracy=dummy_accuracy, lr=args.lr, wd=args.wd)
    >>> scheduler = ag.scheduler.FIFOScheduler(train_fn,
    ...                                        resource={'num_cpus': 2, 'num_gpus': 0},
    ...                                        num_trials=20,
    ...                                        reward_attr='accuracy',
    ...                                        time_attr='epoch')
    >>> scheduler.run()
    >>> scheduler.join_jobs()
    >>> scheduler.get_training_curves(plot=True)
    """
    def __init__(self,
                 train_fn,
                 args=None,
                 resource=None,
                 searcher=None,
                 search_options=None,
                 checkpoint='./exp/checkpoint.ag',
                 resume=False,
                 num_trials=None,
                 time_out=None,
                 max_reward=1.0,
                 time_attr='epoch',
                 reward_attr='accuracy',
                 visualizer='none',
                 dist_ip_addrs=None):
        super().__init__(dist_ip_addrs)

        if resource is None:
            resource = {'num_cpus': 1, 'num_gpus': 0}
        self.resource = resource

        if searcher is None:
            searcher = 'random'  # Default: Random searcher

        if isinstance(searcher, str):
            kwargs = search_options.copy() if search_options else dict()
            kwargs['configspace'] = train_fn.cs
            self.searcher: BaseSearcher = searcher_factory(searcher, **kwargs)
        else:
            assert isinstance(searcher, BaseSearcher)
            self.searcher: BaseSearcher = searcher

        assert isinstance(train_fn, _autogluon_method)
        self.train_fn = train_fn
        self.args = args if args else train_fn.args

        # meta data
        self.metadata = {
            'search_space': train_fn.kwspaces,
            'search_strategy': searcher,
            'stop_criterion': {
                'time_limits': time_out,
                'max_reward': max_reward
            },
            'resources_per_trial': resource
        }

        self.num_trials = num_trials
        self.time_out = time_out
        self.max_reward = max_reward
        self._checkpoint = checkpoint
        self._time_attr = time_attr
        self._reward_attr = reward_attr
        self.visualizer = visualizer.lower()
        if self.visualizer == 'tensorboard' or self.visualizer == 'mxboard':
            try_import_mxboard()
            from mxboard import SummaryWriter
            self.mxboard = SummaryWriter(logdir=os.path.join(
                os.path.splitext(checkpoint)[0], 'logs'),
                                         flush_secs=3,
                                         verbose=False)
        self.log_lock = mp.Lock()
        self.training_history = OrderedDict()
        self.config_history = OrderedDict()

        if resume:
            if os.path.isfile(checkpoint):
                self.load_state_dict(load(checkpoint))
            else:
                msg = f'checkpoint path {checkpoint} is not available for resume.'
                logger.exception(msg)
                raise FileExistsError(msg)

    def run(self, **kwargs):
        """Run multiple number of trials
        """
        start_time = time.time()
        num_trials = kwargs.get('num_trials', self.num_trials)
        time_out = kwargs.get('time_out', self.time_out)
        logger.info('Starting Experiments')
        logger.info(f'Num of Finished Tasks is {self.num_finished_tasks}')
        logger.info(
            f'Num of Pending Tasks is {num_trials - self.num_finished_tasks}')
        tbar = tqdm(range(self.num_finished_tasks, num_trials))
        for _ in tbar:
            if time_out and time.time() - start_time >= time_out \
                    or self.max_reward and self.get_best_reward() >= self.max_reward:
                break
            self.schedule_next()

    def save(self, checkpoint=None):
        """Save Checkpoint
        """
        if checkpoint is None:
            if self._checkpoint is None:
                logger.warning("Checkpointing is disabled")
            else:
                checkpoint = self._checkpoint
        if checkpoint is not None:
            with self.LOCK:
                mkdir(os.path.dirname(checkpoint))
                save(self.state_dict(), checkpoint)

    def schedule_next(self):
        """Schedule next searcher suggested task
        """
        # Allow for the promotion of a previously chosen config. Also,
        # extra_kwargs contains extra info passed to both add_job and to
        # get_config (if no config is promoted)
        config, extra_kwargs = self._promote_config()
        if config is None:
            # No config to promote: Query next config to evaluate from searcher
            config = self.searcher.get_config(**extra_kwargs)
            extra_kwargs['new_config'] = True
        else:
            # This is not a new config, but a paused one which is now promoted
            extra_kwargs['new_config'] = False
        task = Task(self.train_fn, {
            'args': self.args,
            'config': config
        }, DistributedResource(**self.resource))
        self.add_job(task, **extra_kwargs)

    def run_with_config(self, config):
        """Run with config for final fit.
        It launches a single training trial under any fixed values of the hyperparameters.
        For example, after HPO has identified the best hyperparameter values based on a hold-out dataset,
        one can use this function to retrain a model with the same hyperparameters on all the available labeled data
        (including the hold-out set). It can also return other objects or states.
        """
        task = Task(self.train_fn, {
            'args': self.args,
            'config': config
        }, DistributedResource(**self.resource))
        reporter = FakeReporter()
        task.args['reporter'] = reporter
        return self.run_job(task)

    def _dict_from_task(self, task):
        if isinstance(task, Task):
            return {'TASK_ID': task.task_id, 'Config': task.args['config']}
        else:
            assert isinstance(task, dict)
            return {'TASK_ID': task['TASK_ID'], 'Config': task['Config']}

    def add_job(self, task, **kwargs):
        """Adding a training task to the scheduler.

        Args:
            task (:class:`autogluon.scheduler.Task`): a new training task

        Relevant entries in kwargs:
            - bracket: HB bracket to be used. Has been sampled in _promote_config
            - new_config: If True, task starts new config eval, otherwise it promotes
              a config (only if type == 'promotion')

        Only if new_config == False:
            - config_key: Internal key for config
            - resume_from: config promoted from this milestone
            - milestone: config promoted to this milestone (next from resume_from)
        """
        cls = FIFOScheduler
        cls.RESOURCE_MANAGER._request(task.resources)
        # reporter
        reporter = DistStatusReporter(remote=task.resources.node)
        task.args['reporter'] = reporter
        # Register pending evaluation
        self.searcher.register_pending(task.args['config'])
        # main process
        job = cls._start_distributed_job(task, cls.RESOURCE_MANAGER)
        # reporter thread
        rp = threading.Thread(target=self._run_reporter,
                              args=(task, job, reporter, self.searcher),
                              daemon=False)
        rp.start()
        task_dict = self._dict_from_task(task)
        task_dict.update({'Task': task, 'Job': job, 'ReporterThread': rp})
        # checkpoint thread
        if self._checkpoint is not None:

            def _save_checkpoint_callback(fut):
                self._cleaning_tasks()
                self.save()

            job.add_done_callback(_save_checkpoint_callback)

        with self.LOCK:
            self.scheduled_tasks.append(task_dict)

    def _clean_task_internal(self, task_dict):
        task_dict['ReporterThread'].join()

    def _run_reporter(self, task, task_job, reporter, searcher):
        last_result = None
        while not task_job.done():
            reported_result = reporter.fetch()
            if 'traceback' in reported_result:
                logger.exception(reported_result['traceback'])
                break

            if reported_result.get('done', False):
                reporter.move_on()
                break
            self._add_training_result(task.task_id,
                                      reported_result,
                                      config=task.args['config'])
            reporter.move_on()
            last_result = reported_result
        if last_result is not None:
            last_result['done'] = True
            searcher.update(config=task.args['config'],
                            reward=last_result[self._reward_attr],
                            **last_result)

    def _promote_config(self):
        """
        Provides a hook in schedule_next, which allows to promote a config
        which has been selected and partially evaluated previously.

        :return: config, extra_args
        """
        config = None
        extra_args = dict()
        return config, extra_args

    def get_best_config(self):
        """Get the best configuration from the finished jobs.
        """
        return self.searcher.get_best_config()

    def get_best_reward(self):
        """Get the best reward from the finished jobs.
        """
        return self.searcher.get_best_reward()

    def _add_training_result(self, task_id, reported_result, config=None):
        if self.visualizer == 'mxboard' or self.visualizer == 'tensorboard':
            if 'loss' in reported_result:
                self.mxboard.add_scalar(
                    tag='loss',
                    value=(f'task {task_id} valid_loss',
                           reported_result['loss']),
                    global_step=reported_result[self._reward_attr])
            self.mxboard.add_scalar(
                tag=self._reward_attr,
                value=(f'task {task_id} {self._reward_attr}',
                       reported_result[self._reward_attr]),
                global_step=reported_result[self._reward_attr])
        with self.log_lock:
            # Note: We store all of reported_result in training_history[task_id],
            # not just the reward value.
            if task_id in self.training_history:
                self.training_history[task_id].append(reported_result)
            else:
                self.training_history[task_id] = [reported_result]
                if config:
                    self.config_history[task_id] = config

    def get_training_curves(self, filename=None, plot=False, use_legend=True):
        """Get Training Curves

        Parameters
        ----------
            filename : str
            plot : bool
            use_legend : bool

        Examples
        --------
        >>> scheduler.run()
        >>> scheduler.join_jobs()
        >>> scheduler.get_training_curves(plot=True)

            .. image:: https://github.com/zhanghang1989/AutoGluonWebdata/blob/master/doc/api/autogluon.1.png?raw=true
        """
        if filename is None and not plot:
            logger.warning(
                'Please either provide filename or allow plot in get_training_curves'
            )
        import matplotlib.pyplot as plt
        plt.ylabel(self._reward_attr)
        plt.xlabel(self._time_attr)
        plt.title("Performance vs Training-Time in each HPO Trial")
        with self.log_lock:
            for task_id, task_res in self.training_history.items():
                rewards = [x[self._reward_attr] for x in task_res]
                x = list(range(len(task_res)))
                plt.plot(x, rewards, label=f'task {task_id}')
        if use_legend:
            plt.legend(loc='best')
        if filename:
            logger.info(f'Saving Training Curve in {filename}')
            plt.savefig(filename)
        if plot:
            plt.show()

    def state_dict(self, destination=None):
        """Returns a dictionary containing a whole state of the Scheduler

        Examples
        --------
        >>> ag.save(scheduler.state_dict(), 'checkpoint.ag')
        """
        destination = super().state_dict(destination)
        destination['searcher'] = pickle.dumps(self.searcher)
        with self.log_lock:
            destination['training_history'] = json.dumps(self.training_history)
        if self.visualizer == 'mxboard' or self.visualizer == 'tensorboard':
            destination['visualizer'] = json.dumps(self.mxboard._scalar_dict)
        return destination

    def load_state_dict(self, state_dict):
        """Load from the saved state dict.

        Examples
        --------
        >>> scheduler.load_state_dict(ag.load('checkpoint.ag'))
        """
        super().load_state_dict(state_dict)
        self.searcher = pickle.loads(state_dict['searcher'])
        with self.log_lock:
            self.training_history = json.loads(state_dict['training_history'])
        if self.visualizer == 'mxboard' or self.visualizer == 'tensorboard':
            self.mxboard._scalar_dict = json.loads(state_dict['visualizer'])
        logger.debug(f'Loading Searcher State {self.searcher}')
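When visualizer='mxboard' (or 'tensorboard') is passed, _add_training_result mirrors each reported reward (and loss, if present) into a SummaryWriter created next to the checkpoint. A hedged usage sketch, reusing the train_fn from the docstring example above:

import autogluon as ag

scheduler = ag.scheduler.FIFOScheduler(train_fn,
                                       resource={'num_cpus': 2, 'num_gpus': 0},
                                       num_trials=10,
                                       reward_attr='accuracy',
                                       time_attr='epoch',
                                       checkpoint='./exp/checkpoint.ag',
                                       visualizer='mxboard')  # event files land in ./exp/checkpoint/logs
scheduler.run()
scheduler.join_jobs()
print(scheduler.get_best_config(), scheduler.get_best_reward())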
Ejemplo n.º 18
0
class Trainer(object):
    def __init__(self, args):
        self.args = args
        self.sw = SummaryWriter(logdir='logs', flush_secs=5)
        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        # dataset and dataloader
        data_kwargs = {'transform': input_transform, 'base_size': args.base_size,
                       'crop_size': args.crop_size}
        trainset = COCOSemantic(split='train', mode='train', **data_kwargs)
        valset = COCOSemantic(split='val', mode='val', **data_kwargs)
        self.train_data = gluon.data.DataLoader(
            trainset, args.batch_size, shuffle=True, last_batch='rollover',
            num_workers=args.workers)
        self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size,
                                               last_batch='rollover', num_workers=args.workers)
        model = model_zoo.PSPNet(nclass=trainset.NUM_CLASS, backbone='resnet50', aux=True, pretrained_base=True)
        model.cast(args.dtype)
        self.net = DataParallelModel(model, args.ctx, args.syncbn)
        self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
        # resume checkpoint if needed
        if args.resume is not None:
            if os.path.isfile(args.resume):
                model.load_parameters(args.resume, ctx=args.ctx)
            else:
                raise RuntimeError("=> no checkpoint found at '{}'" \
                                   .format(args.resume))
        # create criterion
        criterion = MixSoftmaxCrossEntropyLoss(args.aux, aux_weight=args.aux_weight)
        self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn)
        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr,
                                        niters=len(self.train_data),
                                        nepochs=args.epochs)
        kv = mx.kv.create(args.kvstore)
        optimizer_params = {'lr_scheduler': self.lr_scheduler,
                            'wd': args.weight_decay,
                            'momentum': args.momentum}
        if args.dtype == 'float16':
            optimizer_params['multi_precision'] = True

        if args.no_wd:
            for k, v in self.net.module.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                       optimizer_params, kvstore=kv)
        # evaluation metrics
        self.metric = gluoncv.utils.metrics.SegmentationMetric(trainset.num_class)

    def training(self, epoch):
        tbar = tqdm(self.train_data)
        train_loss = 0.0
        alpha = 0.2
        for i, (data, target) in enumerate(tbar):
            self.lr_scheduler.update(i, epoch)
            with autograd.record(True):
                outputs = self.net(data.astype(args.dtype, copy=False))
                losses = self.criterion(outputs, target)
                mx.nd.waitall()
                autograd.backward(losses)
            self.optimizer.step(self.args.batch_size)
            for loss in losses:
                train_loss += loss.asnumpy()[0] / len(losses)
                self.sw.add_scalar(tag='Training MixSoftmaxCrossEntropyLoss', value=train_loss, global_step=epoch)
            tbar.set_description('Epoch %d, training loss %.3f' % \
                                 (epoch, train_loss / (i + 1)))
            mx.nd.waitall()

        # save every epoch
        save_checkpoint(self.net.module, self.args, False)

    def validation(self, epoch):
        # total_inter, total_union, total_correct, total_label = 0, 0, 0, 0
        self.metric.reset()
        tbar = tqdm(self.eval_data)
        for i, (data, target) in enumerate(tbar):
            outputs = self.evaluator(data.astype(args.dtype, copy=False))
            outputs = [x[0] for x in outputs]
            targets = mx.gluon.utils.split_and_load(target, args.ctx, even_split=False)
            self.metric.update(targets, outputs)
            pixAcc, mIoU = self.metric.get()
            tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f' % \
                                 (epoch, pixAcc, mIoU))
            self.sw.add_scalar(tag='Pixel Accuracy (on valset)', value=pixAcc, global_step=epoch)
            self.sw.add_scalar(tag='mIoU (on valset)', value=mIoU, global_step=epoch)
            mx.nd.waitall()
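A hypothetical driver for the Trainer above; parse_args is an assumed argparse helper, and the args fields mirror the attributes read in __init__:

if __name__ == '__main__':
    args = parse_args()  # assumed helper, not shown in this excerpt
    trainer = Trainer(args)
    for epoch in range(args.epochs):
        trainer.training(epoch)
        trainer.validation(epoch)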
Ejemplo n.º 19
0
class TabularNeuralNetModel(AbstractNeuralNetworkModel):
    """ Class for neural network models that operate on tabular data.
        These networks use different types of input layers to process different types of data in various columns.

        Attributes:
            _types_of_features (dict): keys = 'continuous', 'skewed', 'onehot', 'embed', 'language'; values = column-names of Dataframe corresponding to the features of this type
            feature_arraycol_map (OrderedDict): maps feature-name -> list of column-indices in df corresponding to this feature
            feature_type_map (OrderedDict): maps feature-name -> feature_type string (options: 'vector', 'embed', 'language')
            processor (sklearn.ColumnTransformer): scikit-learn preprocessor object.

        Note: This model always assumes higher values of self.eval_metric indicate better performance.

    """

    # Constants used throughout this class:
    # model_internals_file_name = 'model-internals.pkl' # store model internals here
    unique_category_str = '!missing!' # string used to represent missing values and unknown categories for categorical features. Should not appear in the dataset
    rescale_losses = {gluon.loss.L1Loss:'std', gluon.loss.HuberLoss:'std', gluon.loss.L2Loss:'var'} # dict of loss names where we should rescale loss, value indicates how to rescale. Call self.loss_func.name
    params_file_name = 'net.params' # Stores parameters of final network
    temp_file_name = 'temp_net.params' # Stores temporary network parameters (e.g. during the course of training)

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        """
        TabularNeuralNetModel object.

        Parameters
        ----------
        path (str): file-path to directory where to save files associated with this model
        name (str): name used to refer to this model
        problem_type (str): what type of prediction problem is this model used for
        eval_metric (func): function used to evaluate performance (Note: we assume higher = better)
        hyperparameters (dict): various hyperparameters for neural network and the NN-specific data processing
        features (list): List of predictive features to use, other features are ignored by the model.
        """
        self.feature_arraycol_map = None
        self.feature_type_map = None
        self.features_to_drop = []  # may change between different bagging folds. TODO: consider just removing these from self.features if it works with bagging
        self.processor = None  # data processor
        self.summary_writer = None
        self.ctx = mx.cpu()
        self.batch_size = None
        self.num_dataloading_workers = None
        self.num_dataloading_workers_inference = 0
        self.params_post_fit = None
        self.num_net_outputs = None
        self._architecture_desc = None
        self.optimizer = None
        self.verbosity = None
        if self.stopping_metric is not None and self.eval_metric == roc_auc and self.stopping_metric == log_loss:
            self.stopping_metric = roc_auc  # NN is overconfident so early stopping with logloss can halt training too quickly

        self.eval_metric_name = self.stopping_metric.name

    def _set_default_params(self):
        """ Specifies hyperparameter values to use by default """
        default_params = get_default_param(self.problem_type)
        for param, val in default_params.items():
            self._set_default_param_value(param, val)

    def _get_default_auxiliary_params(self) -> dict:
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            ignored_type_group_raw=[R_OBJECT],
            ignored_type_group_special=[S_TEXT_NGRAM, S_TEXT_AS_CATEGORY],
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    def _get_default_searchspace(self):
        return get_default_searchspace(self.problem_type, num_classes=None)

    def set_net_defaults(self, train_dataset, params):
        """ Sets dataset-adaptive default values to use for our neural network """
        if (self.problem_type == MULTICLASS) or (self.problem_type == SOFTCLASS):
            self.num_net_outputs = train_dataset.num_classes
        elif self.problem_type == REGRESSION:
            self.num_net_outputs = 1
            if params['y_range'] is None:  # Infer default y-range
                y_vals = train_dataset.dataset._data[train_dataset.label_index].asnumpy()
                min_y = float(min(y_vals))
                max_y = float(max(y_vals))
                std_y = np.std(y_vals)
                y_ext = params['y_range_extend'] * std_y
                if min_y >= 0:  # infer y must be nonnegative
                    min_y = max(0, min_y-y_ext)
                else:
                    min_y = min_y-y_ext
                if max_y <= 0:  # infer y must be non-positive
                    max_y = min(0, max_y+y_ext)
                else:
                    max_y = max_y+y_ext
                params['y_range'] = (min_y, max_y)
        elif self.problem_type == BINARY:
            self.num_net_outputs = 2
        else:
            raise ValueError("unknown problem_type specified: %s" % self.problem_type)

        if params['layers'] is None:  # Use default choices for MLP architecture
            if self.problem_type == REGRESSION:
                default_layer_sizes = [256, 128]  # overall network will have 4 layers. Input layer, 256-unit hidden layer, 128-unit hidden layer, output layer.
            else:
                default_sizes = [256, 128]  # will be scaled adaptively
                # base_size = max(1, min(self.num_net_outputs, 20)/2.0) # scale layer width based on number of classes
                base_size = max(1, min(self.num_net_outputs, 100) / 50)  # TODO: Updated because it improved model quality and made training far faster
                default_layer_sizes = [defaultsize*base_size for defaultsize in default_sizes]
            layer_expansion_factor = 1  # TODO: consider scaling based on num_rows, eg: layer_expansion_factor = 2-np.exp(-max(0,train_dataset.num_examples-10000))

            max_layer_width = params['max_layer_width']
            params['layers'] = [int(min(max_layer_width, layer_expansion_factor*defaultsize)) for defaultsize in default_layer_sizes]

        if train_dataset.has_vector_features() and params['numeric_embed_dim'] is None:  # Use default choices for numeric embedding size
            vector_dim = train_dataset.dataset._data[train_dataset.vectordata_index].shape[1]  # total dimensionality of vector features
            prop_vector_features = train_dataset.num_vector_features() / float(train_dataset.num_features)  # Fraction of features that are numeric
            min_numeric_embed_dim = 32
            max_numeric_embed_dim = params['max_layer_width']
            params['numeric_embed_dim'] = int(min(max_numeric_embed_dim, max(min_numeric_embed_dim,
                                                    params['layers'][0]*prop_vector_features*np.log10(vector_dim+10) )))
        return

    def _fit(self, X_train, y_train, X_val=None, y_val=None, time_limit=None, reporter=None, **kwargs):
        """ X_train (pd.DataFrame): training data features (not necessarily preprocessed yet)
            X_val (pd.DataFrame): validation data features (should have the same column names as X_train)
            y_train (pd.Series):
            y_val (pd.Series): are pandas Series
            kwargs: Can specify amount of compute resources to utilize (num_cpus, num_gpus).
        """
        start_time = time.time()
        params = self.params.copy()
        self.verbosity = kwargs.get('verbosity', 2)
        params = fixedvals_from_searchspaces(params)
        if self.feature_metadata is None:
            raise ValueError("Trainer class must set feature_metadata for this model")
        if 'num_cpus' in kwargs:
            self.num_dataloading_workers = max(1, int(kwargs['num_cpus']/2.0))
        else:
            self.num_dataloading_workers = 1
        if self.num_dataloading_workers == 1:
            self.num_dataloading_workers = 0  # 0 is always faster and uses less memory than 1
        self.batch_size = params['batch_size']
        train_dataset, val_dataset = self.generate_datasets(X_train=X_train, y_train=y_train, params=params, X_val=X_val, y_val=y_val)
        logger.log(15, "Training data for neural network has: %d examples, %d features (%d vector, %d embedding, %d language)" %
              (train_dataset.num_examples, train_dataset.num_features,
               len(train_dataset.feature_groups['vector']), len(train_dataset.feature_groups['embed']),
               len(train_dataset.feature_groups['language']) ))
        # self._save_preprocessor()  # TODO: should save these things for hyperparam tuning. Need one HP tuner for network-specific HPs, another for preprocessing HPs.

        if 'num_gpus' in kwargs and kwargs['num_gpus'] >= 1:
            self.ctx = mx.gpu()  # Currently cannot use more than 1 GPU
        else:
            self.ctx = mx.cpu()
        self.get_net(train_dataset, params=params)

        if time_limit:
            time_elapsed = time.time() - start_time
            time_limit = time_limit - time_elapsed

        self.train_net(train_dataset=train_dataset, params=params, val_dataset=val_dataset, initialize=True, setup_trainer=True, time_limit=time_limit, reporter=reporter)
        self.params_post_fit = params
        """
        # TODO: if we don't want to save intermediate network parameters, need to do something like saving in temp directory to clean up after training:
        with make_temp_directory() as temp_dir:
            save_callback = SaveModelCallback(self.model, monitor=self.metric, mode=save_callback_mode, name=self.name)
            with progress_disabled_ctx(self.model) as model:
                original_path = model.path
                model.path = Path(temp_dir)
                model.fit_one_cycle(self.epochs, self.lr, callbacks=save_callback)

                # Load the best one and export it
                model.load(self.name)
                print(f'Model validation metrics: {model.validate()}')
                model.path = original_path
        """

    def get_net(self, train_dataset, params):
        """ Creates a Gluon neural net and context for this dataset.
            Also sets up trainer/optimizer as necessary.
        """
        self.set_net_defaults(train_dataset, params)
        self.model = EmbedNet(train_dataset=train_dataset, params=params, num_net_outputs=self.num_net_outputs, ctx=self.ctx)

        # TODO: Below should not occur until at time of saving
        if not os.path.exists(self.path):
            os.makedirs(self.path)

    def train_net(self, train_dataset, params, val_dataset=None, initialize=True, setup_trainer=True, time_limit=None, reporter=None):
        """ Trains neural net on given train dataset, early stops based on test_dataset.
            Args:
                train_dataset (TabularNNDataset): training data used to learn network weights
                val_dataset (TabularNNDataset): validation data used for hyperparameter tuning
                initialize (bool): set = False to continue training of a previously trained model, otherwise initializes network weights randomly
                setup_trainer (bool): set = False to reuse the same trainer from a previous training run, otherwise creates new trainer from scratch
        """
        start_time = time.time()
        logger.log(15, "Training neural network for up to %s epochs..." % params['num_epochs'])
        seed_value = params.get('seed_value')
        if seed_value is not None:  # Set seed
            random.seed(seed_value)
            np.random.seed(seed_value)
            mx.random.seed(seed_value)
        if initialize:  # Initialize the weights of network
            logging.debug("initializing neural network...")
            self.model.collect_params().initialize(ctx=self.ctx)
            self.model.hybridize()
            logging.debug("initialized")
        if setup_trainer:
            # Also setup mxboard to monitor training if visualizer has been specified:
            visualizer = params.get('visualizer', 'none')
            if visualizer == 'tensorboard' or visualizer == 'mxboard':
                try_import_mxboard()
                from mxboard import SummaryWriter
                self.summary_writer = SummaryWriter(logdir=self.path, flush_secs=5, verbose=False)
            self.optimizer = self.setup_trainer(params=params, train_dataset=train_dataset)
        best_val_metric = -np.inf  # higher = better
        val_metric = None
        best_val_epoch = 0
        val_improve_epoch = 0  # most recent epoch where validation-score strictly improved
        num_epochs = params['num_epochs']
        if val_dataset is not None:
            y_val = val_dataset.get_labels()
        else:
            y_val = None

        if params['loss_function'] is None:
            if self.problem_type == REGRESSION:
                params['loss_function'] = gluon.loss.L1Loss()
            elif self.problem_type == SOFTCLASS:
                params['loss_function'] = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False, from_logits=self.model.from_logits)
            else:
                params['loss_function'] = gluon.loss.SoftmaxCrossEntropyLoss(from_logits=self.model.from_logits)

        loss_func = params['loss_function']
        epochs_wo_improve = params['epochs_wo_improve']
        loss_scaling_factor = 1.0  # we divide loss by this quantity to stabilize gradients
        loss_torescale = [key for key in self.rescale_losses if isinstance(loss_func, key)]
        if loss_torescale:
            loss_torescale = loss_torescale[0]
            if self.rescale_losses[loss_torescale] == 'std':
                loss_scaling_factor = np.std(train_dataset.get_labels())/5.0 + EPS  # std-dev of labels
            elif self.rescale_losses[loss_torescale] == 'var':
                loss_scaling_factor = np.var(train_dataset.get_labels())/5.0 + EPS  # variance of labels
            else:
                raise ValueError("Unknown loss-rescaling type %s specified for loss_func==%s" % (self.rescale_losses[loss_torescale], loss_func))

        if self.verbosity <= 1:
            verbose_eval = -1  # Print losses every verbose epochs, Never if -1
        elif self.verbosity == 2:
            verbose_eval = 50
        elif self.verbosity == 3:
            verbose_eval = 10
        else:
            verbose_eval = 1

        net_filename = self.path + self.temp_file_name
        if num_epochs == 0:  # use dummy training loop that stops immediately (useful for using NN just for data preprocessing / debugging)
            logger.log(20, "Not training Neural Net since num_epochs == 0.  Neural network architecture is:")
            for batch_idx, data_batch in enumerate(train_dataset.dataloader):
                data_batch = train_dataset.format_batch_data(data_batch, self.ctx)
                with autograd.record():
                    output = self.model(data_batch)
                    labels = data_batch['label']
                    loss = loss_func(output, labels) / loss_scaling_factor
                    # print(str(nd.mean(loss).asscalar()), end="\r")  # prints per-batch losses
                loss.backward()
                self.optimizer.step(labels.shape[0])
                if batch_idx > 0:
                    break
            self.model.save_parameters(net_filename)
            logger.log(15, "untrained Neural Net saved to file")
            return

        # Training Loop:
        for e in range(num_epochs):
            if e == 0:  # special actions during first epoch:
                logger.log(15, "Neural network architecture:")
                logger.log(15, str(self.model))
            cumulative_loss = 0
            for batch_idx, data_batch in enumerate(train_dataset.dataloader):
                data_batch = train_dataset.format_batch_data(data_batch, self.ctx)
                with autograd.record():
                    output = self.model(data_batch)
                    labels = data_batch['label']
                    loss = loss_func(output, labels) / loss_scaling_factor
                    # print(str(nd.mean(loss).asscalar()), end="\r")  # prints per-batch losses
                loss.backward()
                self.optimizer.step(labels.shape[0])
                cumulative_loss += loss.sum()
            train_loss = cumulative_loss/float(train_dataset.num_examples)  # training loss this epoch
            if val_dataset is not None:
                val_metric = self.score(X=val_dataset, y=y_val, eval_metric=self.stopping_metric, metric_needs_y_pred=self.stopping_metric_needs_y_pred)
            if (val_dataset is None) or (val_metric >= best_val_metric) or (e == 0):  # keep training if score has improved
                if val_dataset is not None:
                    if not np.isnan(val_metric):
                        if val_metric > best_val_metric:
                            val_improve_epoch = e
                        best_val_metric = val_metric
                best_val_epoch = e
                # Until functionality is added to restart training from a particular epoch, there is no point in saving params without val_dataset
                if val_dataset is not None:
                    self.model.save_parameters(net_filename)
            if val_dataset is not None:
                if verbose_eval > 0 and e % verbose_eval == 0:
                    logger.log(15, "Epoch %s.  Train loss: %s, Val %s: %s" %
                      (e, train_loss.asscalar(), self.eval_metric_name, val_metric))
                if self.summary_writer is not None:
                    self.summary_writer.add_scalar(tag='val_'+self.eval_metric_name,
                                                   value=val_metric, global_step=e)
            else:
                if verbose_eval > 0 and e % verbose_eval == 0:
                    logger.log(15, "Epoch %s.  Train loss: %s" % (e, train_loss.asscalar()))
            if self.summary_writer is not None:
                self.summary_writer.add_scalar(tag='train_loss', value=train_loss.asscalar(), global_step=e)  # TODO: do we want to keep mxboard support?
            if reporter is not None:
                # TODO: Ensure reporter/scheduler properly handle None/nan values after refactor
                if val_dataset is not None and (not np.isnan(val_metric)):  # TODO: This might work without the if statement
                    # epoch must be number of epochs done (starting at 1)
                    reporter(epoch=e+1, validation_performance=val_metric, train_loss=float(train_loss.asscalar()))  # Higher val_metric = better
            if e - val_improve_epoch > epochs_wo_improve:
                break  # early-stop if validation-score hasn't strictly improved in `epochs_wo_improve` consecutive epochs
            if time_limit:
                time_elapsed = time.time() - start_time
                time_left = time_limit - time_elapsed
                if time_left <= 0:
                    logger.log(20, "\tRan out of time, stopping training early.")
                    break

        if val_dataset is not None:
            self.model.load_parameters(net_filename)  # Revert back to best model
            try:
                os.remove(net_filename)
            except FileNotFoundError:
                pass
        if val_dataset is None:
            logger.log(15, "Best model found in epoch %d" % best_val_epoch)
        else:  # evaluate one final time:
            final_val_metric = self.score(X=val_dataset, y=y_val, eval_metric=self.stopping_metric, metric_needs_y_pred=self.stopping_metric_needs_y_pred)
            if np.isnan(final_val_metric):
                final_val_metric = -np.inf
            logger.log(15, "Best model found in epoch %d. Val %s: %s" %
                  (best_val_epoch, self.eval_metric_name, final_val_metric))
        self.params_trained['num_epochs'] = best_val_epoch + 1
        return

    def _predict_proba(self, X, **kwargs):
        """ To align predict with abstract_model API.
            Preprocess here only refers to feature processing steps done by all AbstractModel objects,
            not tabularNN-specific preprocessing steps.
            If X is not DataFrame but instead TabularNNDataset object, we can still produce predictions,
            but cannot use preprocess in this case (needs to be already processed).
        """
        if isinstance(X, TabularNNDataset):
            return self._predict_tabular_data(new_data=X, process=False, predict_proba=True)
        elif isinstance(X, pd.DataFrame):
            X = self.preprocess(X, **kwargs)
            return self._predict_tabular_data(new_data=X, process=True, predict_proba=True)
        else:
            raise ValueError("X must be of type pd.DataFrame or TabularNNDataset, not type: %s" % type(X))

    def _predict_tabular_data(self, new_data, process=True, predict_proba=True):  # TODO ensure API lines up with tabular.Model class.
        """ Specific TabularNN method to produce predictions on new (unprocessed) data.
            Returns 1D numpy array unless predict_proba=True and task is multi-class classification (not binary).
            Args:
                new_data (pd.Dataframe or TabularNNDataset): new data to make predictions on.
                If you want to make prediction for just a single row of new_data, pass in: new_data.iloc[[row_index]]
                process (bool): should new data be processed (if False, new_data must be TabularNNDataset)
                predict_proba (bool): should we output class-probabilities (not used for regression)
        """
        if process:
            new_data = self.process_test_data(new_data, batch_size=self.batch_size, num_dataloading_workers=self.num_dataloading_workers_inference, labels=None)
        if not isinstance(new_data, TabularNNDataset):
            raise ValueError("new_data must of of type TabularNNDataset if process=False")
        if self.problem_type == REGRESSION or not predict_proba:
            preds = nd.zeros((new_data.num_examples,1))
        else:
            preds = nd.zeros((new_data.num_examples, self.num_net_outputs))
        i = 0
        for batch_idx, data_batch in enumerate(new_data.dataloader):
            data_batch = new_data.format_batch_data(data_batch, self.ctx)
            preds_batch = self.model(data_batch)
            batch_size = len(preds_batch)
            if self.problem_type != REGRESSION:
                if not predict_proba: # need to take argmax
                    preds_batch = nd.argmax(preds_batch, axis=1, keepdims=True)
                else: # need to take softmax
                    preds_batch = nd.softmax(preds_batch, axis=1)
            preds[i:(i+batch_size)] = preds_batch
            i = i+batch_size
        if self.problem_type == REGRESSION or not predict_proba:
            return preds.asnumpy().flatten()  # return 1D numpy array
        elif self.problem_type == BINARY and predict_proba:
            return preds[:,1].asnumpy()  # for binary problems, only return P(Y==+1)

        return preds.asnumpy()  # return 2D numpy array

    def generate_datasets(self, X_train, y_train, params, X_val=None, y_val=None):
        impute_strategy = params['proc.impute_strategy']
        max_category_levels = params['proc.max_category_levels']
        skew_threshold = params['proc.skew_threshold']
        embed_min_categories = params['proc.embed_min_categories']
        use_ngram_features = params['use_ngram_features']

        if isinstance(X_train, TabularNNDataset):
            train_dataset = X_train
        else:
            X_train = self.preprocess(X_train)
            if self.features is None:
                self.features = list(X_train.columns)
            train_dataset = self.process_train_data(
                df=X_train, labels=y_train, batch_size=self.batch_size, num_dataloading_workers=self.num_dataloading_workers,
                impute_strategy=impute_strategy, max_category_levels=max_category_levels, skew_threshold=skew_threshold, embed_min_categories=embed_min_categories, use_ngram_features=use_ngram_features,
            )
        if X_val is not None:
            if isinstance(X_val, TabularNNDataset):
                val_dataset = X_val
            else:
                X_val = self.preprocess(X_val)
                val_dataset = self.process_test_data(df=X_val, labels=y_val, batch_size=self.batch_size, num_dataloading_workers=self.num_dataloading_workers_inference)
        else:
            val_dataset = None
        return train_dataset, val_dataset

    def process_test_data(self, df, batch_size, num_dataloading_workers, labels=None):
        """ Process train or test DataFrame into a form fit for neural network models.
        Args:
            df (pd.DataFrame): Data to be processed (X)
            labels (pd.Series): labels to be processed (y)
            test (bool): Is this test data where each datapoint should be processed separately using predetermined preprocessing steps.
                         Otherwise preprocessor uses all data to determine propreties like best scaling factors, number of categories, etc.
        Returns:
            Dataset object
        """
        warnings.filterwarnings("ignore", module='sklearn.preprocessing') # sklearn processing n_quantiles warning
        if labels is not None and len(labels) != len(df):
            raise ValueError("Number of examples in Dataframe does not match number of labels")
        if (self.processor is None or self._types_of_features is None
           or self.feature_arraycol_map is None or self.feature_type_map is None):
            raise ValueError("Need to process training data before test data")
        if self.features_to_drop:
            drop_cols = [col for col in df.columns if col in self.features_to_drop]
            if drop_cols:
                df = df.drop(columns=drop_cols)

        df = self.processor.transform(df) # 2D numpy array. self.feature_arraycol_map, self.feature_type_map have been previously set while processing training data.
        return TabularNNDataset(df, self.feature_arraycol_map, self.feature_type_map,
                                batch_size=batch_size, num_dataloading_workers=num_dataloading_workers,
                                problem_type=self.problem_type, labels=labels, is_test=True)

    def process_train_data(self, df, batch_size, num_dataloading_workers, impute_strategy, max_category_levels, skew_threshold, embed_min_categories, use_ngram_features, labels):
        """ Preprocess training data and create self.processor object that can be used to process future data.
            This method should only be used once per TabularNeuralNetModel object; otherwise, a Warning is raised.

        # TODO no label processing for now
        # TODO: language features are ignored for now
        # TODO: add time/ngram features
        # TODO: no filtering of data-frame columns based on statistics, e.g. categorical columns with all unique variables or zero-variance features.
                This should be done in default_learner class for all models not just TabularNeuralNetModel...
        """
        warnings.filterwarnings("ignore", module='sklearn.preprocessing')  # sklearn processing n_quantiles warning
        if set(df.columns) != set(self.features):
            raise ValueError("Column names in provided Dataframe do not match self.features")
        if labels is None:
            raise ValueError("Attempting process training data without labels")
        if len(labels) != len(df):
            raise ValueError("Number of examples in Dataframe does not match number of labels")

        self._types_of_features, df = self._get_types_of_features(df, skew_threshold=skew_threshold, embed_min_categories=embed_min_categories, use_ngram_features=use_ngram_features)  # dict with keys: 'continuous', 'skewed', 'onehot', 'embed', 'language'; values = column-names of df
        logger.log(15, "AutoGluon Neural Network infers features are of the following types:")
        logger.log(15, json.dumps(self._types_of_features, indent=4))
        logger.log(15, "\n")
        self.processor = self._create_preprocessor(impute_strategy=impute_strategy, max_category_levels=max_category_levels)
        df = self.processor.fit_transform(df) # 2D numpy array
        self.feature_arraycol_map = self._get_feature_arraycol_map(max_category_levels=max_category_levels)  # OrderedDict of feature-name -> list of column-indices in df corresponding to this feature
        num_array_cols = np.sum([len(self.feature_arraycol_map[key]) for key in self.feature_arraycol_map])  # should match number of columns in processed array
        if num_array_cols != df.shape[1]:
            raise ValueError("Error during one-hot encoding data processing for neural network. Number of columns in df array does not match feature_arraycol_map.")

        self.feature_type_map = self._get_feature_type_map()  # OrderedDict of feature-name -> feature_type string (options: 'vector', 'embed', 'language')
        return TabularNNDataset(df, self.feature_arraycol_map, self.feature_type_map,
                                batch_size=batch_size, num_dataloading_workers=num_dataloading_workers,
                                problem_type=self.problem_type, labels=labels, is_test=False)

    def setup_trainer(self, params, train_dataset=None):
        """ Set up optimizer needed for training.
            Network must first be initialized before this.
        """
        optimizer_opts = {'learning_rate': params['learning_rate'], 'wd': params['weight_decay'], 'clip_gradient': params['clip_gradient']}
        if 'lr_scheduler' in params and params['lr_scheduler'] is not None:
            if train_dataset is None:
                raise ValueError("train_dataset cannot be None when lr_scheduler is specified.")
            base_lr = params.get('base_lr', 1e-6)
            target_lr = params.get('target_lr', 1.0)
            warmup_epochs = params.get('warmup_epochs', 10)
            lr_decay = params.get('lr_decay', 0.1)
            lr_mode = params['lr_scheduler']
            num_batches = train_dataset.num_examples // params['batch_size']
            lr_decay_epoch = [max(warmup_epochs, int(params['num_epochs']/3)), max(warmup_epochs+1, int(params['num_epochs']/2)),
                              max(warmup_epochs+2, int(2*params['num_epochs']/3))]
            lr_scheduler = LRSequential([
                LRScheduler('linear', base_lr=base_lr, target_lr=target_lr, nepochs=warmup_epochs, iters_per_epoch=num_batches),
                LRScheduler(lr_mode, base_lr=target_lr, target_lr=base_lr, nepochs=params['num_epochs'] - warmup_epochs,
                            iters_per_epoch=num_batches, step_epoch=lr_decay_epoch, step_factor=lr_decay, power=2)
            ])
            optimizer_opts['lr_scheduler'] = lr_scheduler
        if params['optimizer'] == 'sgd':
            if 'momentum' in params:
                optimizer_opts['momentum'] = params['momentum']
            optimizer = gluon.Trainer(self.model.collect_params(), 'sgd', optimizer_opts)
        elif params['optimizer'] == 'adam':  # TODO: Can we try AdamW?
            optimizer = gluon.Trainer(self.model.collect_params(), 'adam', optimizer_opts)
        else:
            raise ValueError("Unknown optimizer specified: %s" % params['optimizer'])
        return optimizer

    @staticmethod
    def convert_df_dtype_to_str(df):
        return df.astype(str)

    def _get_feature_arraycol_map(self, max_category_levels):
        """ Returns OrderedDict of feature-name -> list of column-indices in processed data array corresponding to this feature """
        feature_preserving_transforms = {'continuous', 'skewed', 'ordinal', 'language'}  # these transforms do not alter the dimensionality of a feature
        feature_arraycol_map = {}  # unordered version
        current_colindex = 0
        for transformer in self.processor.transformers_:
            transformer_name = transformer[0]
            transformed_features = transformer[2]
            if transformer_name in feature_preserving_transforms:
                for feature in transformed_features:
                    if feature in feature_arraycol_map:
                        raise ValueError("same feature is processed by two different column transformers: %s" % feature)
                    feature_arraycol_map[feature] = [current_colindex]
                    current_colindex += 1
            elif transformer_name == 'onehot':
                oh_encoder = [step for (name, step) in transformer[1].steps if name == 'onehot'][0]
                for i in range(len(transformed_features)):
                    feature = transformed_features[i]
                    if feature in feature_arraycol_map:
                        raise ValueError("same feature is processed by two different column transformers: %s" % feature)
                    oh_dimensionality = min(len(oh_encoder.categories_[i]), max_category_levels+1)
                    feature_arraycol_map[feature] = list(range(current_colindex, current_colindex+oh_dimensionality))
                    current_colindex += oh_dimensionality
            else:
                raise ValueError("unknown transformer encountered: %s" % transformer_name)
        return OrderedDict([(key, feature_arraycol_map[key]) for key in feature_arraycol_map])

    def _get_feature_type_map(self):
        """ Returns OrderedDict of feature-name -> feature_type string (options: 'vector', 'embed', 'language') """
        if self.feature_arraycol_map is None:
            raise ValueError("must first call _get_feature_arraycol_map() before _get_feature_type_map()")
        vector_features = self._types_of_features['continuous'] + self._types_of_features['skewed'] + self._types_of_features['onehot']
        feature_type_map = OrderedDict()
        for feature_name in self.feature_arraycol_map:
            if feature_name in vector_features:
                feature_type_map[feature_name] = 'vector'
            elif feature_name in self._types_of_features['embed']:
                feature_type_map[feature_name] = 'embed'
            elif feature_name in self._types_of_features['language']:
                feature_type_map[feature_name] = 'language'
            else:
                raise ValueError("unknown feature type encountered")
        return feature_type_map

    def _create_preprocessor(self, impute_strategy, max_category_levels):
        """ Defines data encoders used to preprocess different data types and creates instance variable which is sklearn ColumnTransformer object """
        if self.processor is not None:
            Warning("Attempting to process training data for TabularNeuralNetModel, but previously already did this.")
        continuous_features = self._types_of_features['continuous']
        skewed_features = self._types_of_features['skewed']
        onehot_features = self._types_of_features['onehot']
        embed_features = self._types_of_features['embed']
        language_features = self._types_of_features['language']
        transformers = []  # order of various column transformers in this list is important!
        if continuous_features:
            continuous_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy=impute_strategy)),
                ('scaler', StandardScaler())])
            transformers.append( ('continuous', continuous_transformer, continuous_features) )
        if skewed_features:
            power_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy=impute_strategy)),
                ('quantile', QuantileTransformer(output_distribution='normal')) ])  # Or output_distribution = 'uniform'
            transformers.append( ('skewed', power_transformer, skewed_features) )
        if onehot_features:
            onehot_transformer = Pipeline(steps=[
                # TODO: Consider avoiding converting to string for improved memory efficiency
                ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
                ('onehot', OneHotMergeRaresHandleUnknownEncoder(max_levels=max_category_levels, sparse=False))])  # test-time unknown values will be encoded as all zeros vector
            transformers.append( ('onehot', onehot_transformer, onehot_features) )
        if embed_features:  # Ordinal transformer applied to convert to-be-embedded categorical features to integer levels
            ordinal_transformer = Pipeline(steps=[
                ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
                ('ordinal', OrdinalMergeRaresHandleUnknownEncoder(max_levels=max_category_levels))])  # returns 0-n when max_category_levels = n-1. category n is reserved for unknown test-time categories.
            transformers.append( ('ordinal', ordinal_transformer, embed_features) )
        if language_features:
            raise NotImplementedError("language_features cannot be used at the moment")
        return ColumnTransformer(transformers=transformers)  # numeric features are processed in the same order as in numeric_features vector, so feature-names remain the same.

    def save(self, path: str = None, verbose=True) -> str:
        if self.model is not None:
            self._architecture_desc = self.model.architecture_desc
        temp_model = self.model
        temp_sw = self.summary_writer
        self.model = None
        self.summary_writer = None
        path_final = super().save(path=path, verbose=verbose)
        self.model = temp_model
        self.summary_writer = temp_sw
        self._architecture_desc = None

        # Export model
        if self.model is not None:
            params_filepath = path_final + self.params_file_name
            # TODO: Don't use os.makedirs here, have save_parameters function in tabular_nn_model that checks if local path or S3 path
            os.makedirs(os.path.dirname(path_final), exist_ok=True)
            self.model.save_parameters(params_filepath)
        return path_final

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
        model: TabularNeuralNetModel = super().load(path=path, reset_paths=reset_paths, verbose=verbose)
        if model._architecture_desc is not None:
            model.model = EmbedNet(architecture_desc=model._architecture_desc, ctx=model.ctx)  # recreate network from architecture description
            model._architecture_desc = None
            # TODO: maybe need to initialize/hybridize?
            model.model.load_parameters(model.path + model.params_file_name, ctx=model.ctx)
            model.summary_writer = None
        return model

    def hyperparameter_tune(self, X_train, y_train, X_val, y_val, scheduler_options, **kwargs):
        """ Performs HPO and sets self.params to best hyperparameter values """
        time_start = time.time()
        self.verbosity = kwargs.get('verbosity', 2)
        logger.log(15, "Beginning hyperparameter tuning for Neural Network...")
        self._set_default_searchspace()  # changes non-specified default hyperparams from fixed values to search-spaces.
        if self.feature_metadata is None:
            raise ValueError("Trainer class must set feature_metadata for this model")
        scheduler_func = scheduler_options[0]
        scheduler_options = scheduler_options[1]
        if scheduler_func is None or scheduler_options is None:
            raise ValueError("scheduler_func and scheduler_options cannot be None for hyperparameter tuning")
        num_cpus = scheduler_options['resource']['num_cpus']
        # num_gpus = scheduler_options['resource']['num_gpus']  # TODO: Currently unused

        params_copy = self.params.copy()

        self.num_dataloading_workers = max(1, int(num_cpus/2.0))
        self.batch_size = params_copy['batch_size']
        train_dataset, val_dataset = self.generate_datasets(X_train=X_train, y_train=y_train, params=params_copy, X_val=X_val, y_val=y_val)
        train_path = self.path + "train"
        val_path = self.path + "validation"
        train_dataset.save(file_prefix=train_path)
        val_dataset.save(file_prefix=val_path)

        if not np.any([isinstance(params_copy[hyperparam], Space) for hyperparam in params_copy]):
            logger.warning("Warning: Attempting to do hyperparameter optimization without any search space (all hyperparameters are already fixed values)")
        else:
            logger.log(15, "Hyperparameter search space for Neural Network: ")
            for hyperparam in params_copy:
                if isinstance(params_copy[hyperparam], Space):
                    logger.log(15, str(hyperparam)+ ":   "+str(params_copy[hyperparam]))

        util_args = dict(
            train_path=train_path,
            val_path=val_path,
            model=self,
            time_start=time_start,
            time_limit=scheduler_options['time_out']
        )
        tabular_nn_trial.register_args(util_args=util_args, **params_copy)
        scheduler = scheduler_func(tabular_nn_trial, **scheduler_options)
        if ('dist_ip_addrs' in scheduler_options) and (len(scheduler_options['dist_ip_addrs']) > 0):
            # TODO: Ensure proper working directory setup on remote machines
            # This is multi-machine setting, so need to copy dataset to workers:
            logger.log(15, "Uploading preprocessed data to remote workers...")
            scheduler.upload_files([train_path+TabularNNDataset.DATAOBJ_SUFFIX,
                                train_path+TabularNNDataset.DATAVALUES_SUFFIX,
                                val_path+TabularNNDataset.DATAOBJ_SUFFIX,
                                val_path+TabularNNDataset.DATAVALUES_SUFFIX])  # TODO: currently does not work.
            logger.log(15, "uploaded")

        scheduler.run()
        scheduler.join_jobs()
        scheduler.get_training_curves(plot=False, use_legend=False)

        return self._get_hpo_results(scheduler=scheduler, scheduler_options=scheduler_options, time_start=time_start)

    def get_info(self):
        info = super().get_info()
        info['hyperparameters_post_fit'] = self.params_post_fit
        return info

    def reduce_memory_size(self, remove_fit=True, requires_save=True, **kwargs):
        super().reduce_memory_size(remove_fit=remove_fit, requires_save=requires_save, **kwargs)
        if remove_fit and requires_save:
            self.optimizer = None
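
The save()/load() pair in this snippet detaches self.summary_writer (and the live Gluon network) before pickling, because mxboard's SummaryWriter holds an open event-file handle and is not picklable. A minimal sketch of the same pattern outside AutoGluon, with a hypothetical class name and pickle-based save helper:

import pickle

from mxboard import SummaryWriter


class LoggingModel:
    def __init__(self, logdir):
        # the writer is recreated per process and is never serialized
        self.summary_writer = SummaryWriter(logdir=logdir, verbose=False)

    def save(self, path):
        # temporarily detach the non-picklable writer, then restore it
        writer, self.summary_writer = self.summary_writer, None
        try:
            with open(path, 'wb') as f:
                pickle.dump(self, f)
        finally:
            self.summary_writer = writer
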
Ejemplo n.º 20
0
def do_training(args, module, data_train, data_val, begin_epoch=0):
    from distutils.dir_util import mkpath
    from log_util import LogUtil

    log = LogUtil.getInstance().getlogger()
    mkpath(os.path.dirname(get_checkpoint_path(args)))

    #seq_len = args.config.get('arch', 'max_t_count')
    batch_size = args.config.getint('common', 'batch_size')
    save_checkpoint_every_n_epoch = args.config.getint('common', 'save_checkpoint_every_n_epoch')
    save_checkpoint_every_n_batch = args.config.getint('common', 'save_checkpoint_every_n_batch')
    enable_logging_train_metric = args.config.getboolean('train', 'enable_logging_train_metric')
    enable_logging_validation_metric = args.config.getboolean('train', 'enable_logging_validation_metric')

    contexts = parse_contexts(args)
    num_gpu = len(contexts)
    eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_validation_metric, is_epoch_end=True)
    # mxboard setting
    loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_train_metric, is_epoch_end=False)

    optimizer = args.config.get('optimizer', 'optimizer')
    learning_rate = args.config.getfloat('train', 'learning_rate')
    learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing')

    mode = args.config.get('common', 'mode')
    num_epoch = args.config.getint('train', 'num_epoch')
    clip_gradient = args.config.getfloat('optimizer', 'clip_gradient')
    weight_decay = args.config.getfloat('optimizer', 'weight_decay')
    save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states')
    show_every = args.config.getint('train', 'show_every')
    optimizer_params_dictionary = json.loads(args.config.get('optimizer', 'optimizer_params_dictionary'))
    kvstore_option = args.config.get('common', 'kvstore_option')
    n_epoch = begin_epoch
    is_bucketing = args.config.getboolean('arch', 'is_bucketing')

    if clip_gradient == 0:
        clip_gradient = None
    if is_bucketing and mode == 'load':
        model_file = args.config.get('common', 'model_file')
        model_name = os.path.splitext(model_file)[0]
        model_num_epoch = int(model_name[-4:])

        model_path = 'checkpoints/' + str(model_name[:-5])
        symbol, data_names, label_names = module(1600)
        model = STTBucketingModule(
            sym_gen=module,
            default_bucket_key=data_train.default_bucket_key,
            context=contexts)
        data_train.reset()

        model.bind(data_shapes=data_train.provide_data,
                   label_shapes=data_train.provide_label,
                   for_training=True)
        _, arg_params, aux_params = mx.model.load_checkpoint(model_path, model_num_epoch)
        model.set_params(arg_params, aux_params)
        module = model
    else:
        module.bind(data_shapes=data_train.provide_data,
                label_shapes=data_train.provide_label,
                for_training=True)

    if begin_epoch == 0 and mode == 'train':
        module.init_params(initializer=get_initializer(args))


    lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)

    def reset_optimizer(force_init=False):
        optimizer_params = {'lr_scheduler': lr_scheduler,
                            'clip_gradient': clip_gradient,
                            'wd': weight_decay}
        optimizer_params.update(optimizer_params_dictionary)
        module.init_optimizer(kvstore=kvstore_option,
                              optimizer=optimizer,
                              optimizer_params=optimizer_params,
                              force_init=force_init)
    if mode == "train":
        reset_optimizer(force_init=True)
    else:
        reset_optimizer(force_init=False)
        data_train.reset()
        data_train.is_first_epoch = True

    #mxboard setting
    mxlog_dir = args.config.get('common', 'mxboard_log_dir')
    summary_writer = SummaryWriter(mxlog_dir)

    while True:

        if n_epoch >= num_epoch:
            break
        loss_metric.reset()
        log.info('---------train---------')
        for nbatch, data_batch in enumerate(data_train):
            module.forward_backward(data_batch)
            module.update()
            # mxboard setting
            if (nbatch + 1) % show_every == 0:
                module.update_metric(loss_metric, data_batch.label)
            #summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch)
            if (nbatch+1) % save_checkpoint_every_n_batch == 0:
                log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch, nbatch)
                module.save_checkpoint(prefix=get_checkpoint_path(args)+"n_epoch"+str(n_epoch)+"n_batch", epoch=(int((nbatch+1)/save_checkpoint_every_n_batch)-1), save_optimizer_states=save_optimizer_states)
        # commented for Libri_sample data set to see only train cer
        log.info('---------validation---------')
        data_val.reset()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(data_val):
            # with is_train=False, batch-norm switches to inference statistics and yields a much higher CER, so forward with is_train=True here
            module.forward(data_batch, is_train=True)
            module.update_metric(eval_metric, data_batch.label)

        # mxboard setting
        val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
        log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer, int(val_n_label - val_l_dist), val_n_label)
        curr_acc = val_cer
        summary_writer.add_scalar('CER validation', val_cer, n_epoch)
        assert curr_acc is not None, 'validation CER could not be computed from the eval metric'

        data_train.reset()
        data_train.is_first_epoch = False

        # mxboard setting
        train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value()
        summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
        summary_writer.add_scalar('CER train', train_cer, n_epoch)

        # save checkpoints
        if n_epoch % save_checkpoint_every_n_epoch == 0:
            log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch)
            module.save_checkpoint(prefix=get_checkpoint_path(args), epoch=n_epoch, save_optimizer_states=save_optimizer_states)

        n_epoch += 1

        lr_scheduler.learning_rate = learning_rate / learning_rate_annealing

    log.info('FINISH')
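
In this example every scalar is logged with the epoch index as global_step, while the commented-out per-batch call would reuse nbatch, which restarts at 0 each epoch and makes the TensorBoard curve fold back on itself. If per-batch logging is wanted, a single ever-increasing step counter is easier to read. A small sketch, assuming the summary_writer, data_train, module and loss_metric objects from above (get_batch_loss() is the STTMetric helper referenced in the commented-out line):

    global_step = 0
    for n_epoch in range(num_epoch):
        for nbatch, data_batch in enumerate(data_train):
            module.forward_backward(data_batch)
            module.update()
            module.update_metric(loss_metric, data_batch.label)
            # one monotonically increasing step across all epochs
            summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), global_step)
            global_step += 1
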
Ejemplo n.º 21
0
        # in case our last batch was the tail batch of the dataloader,
        # make sure we feed a full batch of noise
        noise = mx.ndarray.random.normal(shape=(opt.batchSize, nz, 1, 1), ctx=context)
        with autograd.record():
            fake = netG(noise)
            errG = netD(fake)
            errG.backward()
        trainerG.step(1)
        gen_iterations += 1

        print('[%d/%d][%d/%d][%d] Loss_D: %f Loss_G: %f Loss_D_real: %f Loss_D_fake %f'
            % (epoch, opt.niter, i, len(dataloader), gen_iterations,
            errD.asnumpy()[0], errG.asnumpy()[0], errD_real.asnumpy()[0], errD_fake.asnumpy()[0]))

        sw.add_scalar(
            tag='loss_D',
            value=-errD.asnumpy()[0],
            global_step=gen_iterations
        )

        if gen_iterations % 500 == 0:
            real_cpu = data * 0.5 + 0.5
            save_images(real_cpu.asnumpy().transpose(0, 2, 3, 1), '{0}/real_samples.png'.format(opt.experiment))
            fake = netG(fixed_noise.as_in_context(context))
            fake = fake * 0.5 + 0.5
            save_images(fake.asnumpy().transpose(0, 2, 3, 1), '{0}/fake_samples_{1}.png'.format(opt.experiment, gen_iterations))

    # do checkpointing
    netG.save_params('{0}/netG_epoch_{1}.pth'.format(opt.experiment, epoch))
    netD.save_params('{0}/netD_epoch_{1}.pth'.format(opt.experiment, epoch))
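
Rather than only writing sample grids to disk, the same batch of generated images can be pushed into the event file. A hedged sketch, assuming mxboard's SummaryWriter.add_image accepts a 4D NCHW NDArray with values in [0, 1] and renders it as an image grid:

        if gen_iterations % 500 == 0:
            fake = netG(fixed_noise.as_in_context(context))
            fake = (fake * 0.5 + 0.5).clip(0, 1)  # un-normalize from [-1, 1] to [0, 1]
            sw.add_image(tag='fake_samples', image=fake, global_step=gen_iterations)
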
Ejemplo n.º 22
0
def model_train(blocks, args, dataset, cheb_polys, ctx, logdir='./logdir'):
    '''
    Parameters
    ----------
    blocks: list[list], model structure, e.g. [[1, 32, 64], [64, 32, 128]]

    args: argparse.Namespace

    dataset: Dataset

    cheb_polys: mx.ndarray,
                shape is (num_of_vertices, order_of_cheb * num_of_vertices)

    ctx: mx.context.Context

    logdir: str, path of mxboard logdir

    '''

    num_of_vertices = args.num_of_vertices
    n_his, n_pred = args.n_his, args.n_pred
    order_of_cheb, Kt = args.order_of_cheb, args.kt
    batch_size, epochs = args.batch_size, args.epochs
    opt = args.opt
    keep_prob = args.keep_prob

    # data
    train = dataset['train'].transpose((0, 3, 1, 2))
    val = dataset['val'].transpose((0, 3, 1, 2))
    test = dataset['test'].transpose((0, 3, 1, 2))

    train_x, train_y = train[:, :, :n_his, :], train[:, :, n_his:, :]
    val_x, val_y = val[:, :, :n_his, :], val[:, :, n_his:, :]
    test_x, test_y = test[:, :, :n_his, :], test[:, :, n_his:, :]

    print(train_x.shape, train_y.shape, val_x.shape, val_y.shape, test_x.shape,
          test_y.shape)

    train_loader = gluon.data.DataLoader(gluon.data.ArrayDataset(
        nd.array(train_x), nd.array(train_y)),
                                         batch_size=batch_size,
                                         shuffle=False)
    val_loader = gluon.data.DataLoader(gluon.data.ArrayDataset(
        nd.array(val_x), nd.array(val_y)),
                                       batch_size=batch_size,
                                       shuffle=False)
    test_loader = gluon.data.DataLoader(gluon.data.ArrayDataset(
        nd.array(test_x), nd.array(test_y)),
                                        batch_size=batch_size,
                                        shuffle=False)

    ground_truth = (
        np.concatenate([y.asnumpy()
                        for x, y in test_loader], axis=0) * dataset.std +
        dataset.mean)

    # model
    model = hybrid_model.STGCN(n_his=n_his,
                               order_of_cheb=order_of_cheb,
                               Kt=Kt,
                               blocks=blocks,
                               keep_prob=keep_prob,
                               num_of_vertices=num_of_vertices,
                               cheb_polys=cheb_polys)
    model.initialize(ctx=ctx, init=mx.init.Xavier())
    model.hybridize()

    # loss function
    loss = gluon.loss.L2Loss()

    # trainer
    trainer = gluon.Trainer(model.collect_params(), args.opt)
    trainer.set_learning_rate(args.lr)

    if not os.path.exists('params'):
        os.mkdir('params')

    sw = SummaryWriter(logdir=logdir, flush_secs=5)
    train_step = 0
    val_step = 0

    for epoch in range(epochs):
        start_time = time.time()
        for x, y in train_loader:
            tmp = nd.concat(x, y, dim=2)
            for pred_idx in range(n_pred):
                end_idx = pred_idx + n_his
                x_ = tmp[:, :, pred_idx:end_idx, :]
                y_ = tmp[:, :, end_idx:end_idx + 1, :]
                with autograd.record():
                    l = loss(model(x_.as_in_context(ctx)),
                             y_.as_in_context(ctx))
                l.backward()
                sw.add_scalar(tag='training_loss',
                              value=l.mean().asscalar(),
                              global_step=train_step)
                trainer.step(x.shape[0])
                train_step += 1

        val_loss_list = []
        for x, y in val_loader:
            pred = predict_batch(model, ctx, x, n_pred)
            val_loss_list.append(loss(pred, y).mean().asscalar())
        sw.add_scalar(tag='val_loss',
                      value=sum(val_loss_list) / len(val_loss_list),
                      global_step=val_step)

        evaluate(model, ctx, ground_truth, test_loader, n_pred, dataset.mean,
                 dataset.std, sw, val_step)
        val_step += 1

        if (epoch + 1) % args.save == 0:
            model.save_parameters('params/{}.params'.format(epoch + 1))

    sw.close()
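
When the learning rate is changed by hand (as with trainer.set_learning_rate above) or by a schedule, it can be logged with the same writer so the loss curves can be read against it. A small sketch using the trainer, sw and train_step variables from this example:

    sw.add_scalar(tag='learning_rate',
                  value=trainer.learning_rate,  # gluon.Trainer exposes its current learning rate
                  global_step=train_step)
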
Ejemplo n.º 23
0
class TrainerAgentMXNET:  # Probably needs refactoring
    """Main training loop"""
    def __init__(self,
                 model,
                 symbol,
                 val_iter,
                 train_config: TrainConfig,
                 train_objects: TrainObjects,
                 use_rtpt: bool,
                 augment=False):
        """
        Class for training the neural network.
        :param model: The model loaded with the MXNet Module functionalities.
        :param symbol: The architecture of the neural network.
        :param val_iter: Iterable object over the validation data.
        :param train_config: An instance of the TrainConfig data class.
        :param train_objects: An instance of the TrainObjects data class.
        :param use_rtpt: If True, an RTPT object will be created and modified within this class.
        """
        # Too many instance attributes (29/7) - Too many arguments (24/5) - Too many local variables (25/15)
        # Too few public methods (1/2)
        self.tc = train_config
        self.to = train_objects
        if self.to.metrics is None:
            self.to.metrics = {}
        self._model = model
        self._symbol = symbol
        self._val_iter = val_iter
        self.x_train = self.yv_train = self.yp_train = None
        self._ctx = get_context(train_config.context, train_config.device_id)
        self._augment = augment

        # define a summary writer that logs data and flushes to the file every 5 seconds
        if self.tc.log_metrics_to_tensorboard:
            self.sum_writer = SummaryWriter(logdir=self.tc.export_dir + "logs",
                                            flush_secs=5,
                                            verbose=False)
        # Define the optimizer
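        # rescale_grad multiplies every gradient by 1/batch_size before the update, turning the summed batch gradient into an average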
        if self.tc.optimizer_name == "adam":
            self.optimizer = mx.optimizer.Adam(
                learning_rate=0.001,
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-8,
                lazy_update=True,
                rescale_grad=(1.0 / self.tc.batch_size))
        elif self.tc.optimizer_name == "nag":
            self.optimizer = mx.optimizer.NAG(
                momentum=self.to.momentum_schedule(0),
                wd=self.tc.wd,
                rescale_grad=(1.0 / self.tc.batch_size))
        else:
            raise Exception("%s is currently not supported as an optimizer." %
                            self.tc.optimizer_name)
        self.ordering = list(
            range(self.tc.nb_parts)
        )  # define a list which describes the order of the processed batches
        # if we augment the data set each part is loaded twice
        if self._augment:
            self.ordering += self.ordering
        # decides if the policy indices shall be selected directly from spatial feature maps without dense layer
        self.batch_end_callbacks = [self.batch_callback]

        # few variables which are internally used
        self.val_loss_best = self.val_p_acc_best = self.k_steps_best = \
            self.old_label = self.value_out = self.t_s = None
        self.patience_cnt = self.batch_proc_tmp = None
        # calculate how many log states will be processed
        self.k_steps_end = round(self.tc.total_it / self.tc.batch_steps)
        if self.k_steps_end == 0:
            self.k_steps_end = 1
        self.k_steps = self.cur_it = self.nb_spikes = self.old_val_loss = self.continue_training = self.t_s_steps = None
        self._train_iter = self.graph_exported = self.val_metric_values = self.val_loss = self.val_p_acc = None
        self.val_metric_values_best = None

        self.use_rtpt = use_rtpt

        if use_rtpt:
            # we use k-steps instead of epochs here
            self.rtpt = RTPT(name_initials=self.tc.name_initials,
                             experiment_name='crazyara',
                             max_iterations=self.k_steps_end -
                             self.tc.k_steps_initial)

    def _log_metrics(self, metric_values, global_step, prefix="train_"):
        """
        Logs a dictionary object of metric value to the console and to tensorboard
        if _log_metrics_to_tensorboard is set to true
        :param metric_values: Dictionary object storing the current metrics
        :param global_step: X-Position point of all metric entries
        :param prefix: Used for labelling the metrics
        :return:
        """
        for name in metric_values.keys():  # show the metric stats
            print(" - %s%s: %.4f" % (prefix, name, metric_values[name]),
                  end="")
            # add the metrics to the tensorboard event file
            if self.tc.log_metrics_to_tensorboard:
                self.sum_writer.add_scalar(
                    name, [prefix.replace("_", ""), metric_values[name]],
                    global_step)

    def train(self, cur_it=None):  # Probably needs refactoring
        """
        Training model
        :param cur_it: Current iteration which is used for the learning rate and momentum schedule.
         If set to None it will be initialized
        :return: return_metrics_and_stop_training()
        """
        # Too many local variables (44/15) - Too many branches (18/12) - Too many statements (108/50)
        # set a custom seed for reproducibility
        if self.tc.seed is not None:
            random.seed(self.tc.seed)
        # define and initialize the variables which will be used
        self.t_s = time()
        # track on how many batches have been processed in this epoch
        self.patience_cnt = epoch = self.batch_proc_tmp = 0
        self.k_steps = self.tc.k_steps_initial  # counter for thousands steps

        if cur_it is None:
            self.cur_it = self.tc.k_steps_initial * 1000
        else:
            self.cur_it = cur_it
        self.nb_spikes = 0  # count the number of spikes that have been detected
        # initialize the loss to compare with, with a very high value
        self.old_val_loss = 9000
        self.graph_exported = False  # create a state variable to check if the net architecture has been reported yet
        self.continue_training = True
        self.optimizer.lr = self.to.lr_schedule(self.cur_it)
        if self.tc.optimizer_name == "nag":
            self.optimizer.momentum = self.to.momentum_schedule(self.cur_it)

        if not self.ordering:  # safety check to prevent eternal loop
            raise Exception(
                "You must have at least one part file in your planes-dataset directory!"
            )

        if self.use_rtpt:
            # Start the RTPT tracking
            self.rtpt.start()

        while self.continue_training:  # Too many nested blocks (7/5)
            # reshuffle the ordering of the training game batches (shuffle works in place)
            random.shuffle(self.ordering)

            epoch += 1
            logging.info("EPOCH %d", epoch)
            logging.info("=========================")
            self.t_s_steps = time()
            self._model.init_optimizer(optimizer=self.optimizer)

            if self._augment:
                # stores part ids that were not augmented yet
                parts_not_augmented = list(set(self.ordering.copy()))
                # stores part ids that were loaded before but not augmented
                parts_to_augment = []

            for part_id in tqdm_notebook(self.ordering):

                if MODE == MODE_XIANGQI:
                    _, self.x_train, self.yv_train, self.yp_train, _ = load_xiangqi_dataset(
                        dataset_type="train",
                        part_id=part_id,
                        normalize=self.tc.normalize,
                        verbose=False)
                    if self._augment:
                        # check whether the current part should be augmented
                        if part_id in parts_to_augment:
                            augment(self.x_train, self.yp_train)
                            logging.debug(
                                "Using augmented part with id {}".format(
                                    part_id))
                        elif part_id in parts_not_augmented:
                            if random.randint(0, 1):
                                augment(self.x_train, self.yp_train)
                                parts_not_augmented.remove(part_id)
                                logging.debug(
                                    "Using augmented part with id {}".format(
                                        part_id))
                            else:
                                parts_to_augment.append(part_id)
                                logging.debug(
                                    "Using unaugmented part with id {}".format(
                                        part_id))
                else:
                    # load one chunk of the dataset from memory
                    _, self.x_train, self.yv_train, self.yp_train, plys_to_end, _ = load_pgn_dataset(
                        dataset_type="train",
                        part_id=part_id,
                        normalize=self.tc.normalize,
                        verbose=False,
                        q_value_ratio=self.tc.q_value_ratio)
                # fill_up_batch if there aren't enough games
                if len(self.yv_train) < self.tc.batch_size:
                    logging.info("filling up batch with too few samples %d" %
                                 len(self.yv_train))
                    self.x_train = fill_up_batch(self.x_train,
                                                 self.tc.batch_size)
                    self.yv_train = fill_up_batch(self.yv_train,
                                                  self.tc.batch_size)
                    self.yp_train = fill_up_batch(self.yp_train,
                                                  self.tc.batch_size)
                    if MODE != MODE_XIANGQI:
                        if plys_to_end is not None:
                            plys_to_end = fill_up_batch(
                                plys_to_end, self.tc.batch_size)

                if MODE != MODE_XIANGQI:
                    if self.tc.discount != 1:
                        self.yv_train *= self.tc.discount**plys_to_end
                self.yp_train = prepare_policy(
                    self.yp_train, self.tc.select_policy_from_plane,
                    self.tc.sparse_policy_label,
                    self.tc.is_policy_from_plane_data)

                if self.tc.use_wdl and self.tc.use_plys_to_end:
                    self._train_iter = mx.io.NDArrayIter(
                        {'data': self.x_train}, {
                            'value_label': self.yv_train,
                            'policy_label': self.yp_train,
                            'wdl_label': value_to_wdl_label(self.yv_train),
                            'plys_to_end_label':
                            prepare_plys_label(plys_to_end)
                        },
                        self.tc.batch_size,
                        shuffle=True)
                else:
                    self._train_iter = mx.io.NDArrayIter(
                        {'data': self.x_train}, {
                            'value_label': self.yv_train,
                            'policy_label': self.yp_train
                        },
                        self.tc.batch_size,
                        shuffle=True)

                # avoid memory leaks by adding synchronization
                mx.nd.waitall()

                reset_metrics(self.to.metrics)
                for batch in self._train_iter:
                    self._model.forward(batch,
                                        is_train=True)  # compute predictions
                    for metric in self.to.metrics:  # update the metrics
                        self._model.update_metric(metric, batch.label)

                    self._model.backward()
                    # compute gradients
                    self._model.update()  # update parameters
                    self.batch_callback()

                    if not self.continue_training:
                        logging.info('Elapsed time for training(hh:mm:ss): ' +
                                     str(
                                         datetime.timedelta(
                                             seconds=round(time() -
                                                           self.t_s))))

                        return return_metrics_and_stop_training(
                            self.k_steps, self.val_metric_values,
                            self.k_steps_best, self.val_metric_values_best)

                # add the graph representation of the network to the tensorboard log file
                if not self.graph_exported and self.tc.log_metrics_to_tensorboard:
                    # self.sum_writer.add_graph(self._symbol)
                    self.graph_exported = True

    def _fill_train_metrics(self):
        """
        Fills in the training metrics
        :return:
        """
        self.train_metric_values = {}
        for metric in self.to.metrics:
            name, value = metric.get()
            self.train_metric_values[name] = value

        self.train_metric_values["loss"] = 0.01 * self.train_metric_values["value_loss"] + \
                                           0.99 * self.train_metric_values["policy_loss"]

    def recompute_eval(self):
        """
        Recomputes the score on the validation data
        :return:
        """
        ms_step = ((time() - self.t_s_steps) / self.tc.batch_steps) * 1000
        logging.info("Step %dK/%dK - %dms/step", self.k_steps,
                     self.k_steps_end, ms_step)
        logging.info("-------------------------")
        logging.debug("Iteration %d/%d", self.cur_it, self.tc.total_it)
        if self.tc.optimizer_name == "nag":
            logging.debug("lr: %.7f - momentum: %.7f", self.optimizer.lr,
                          self.optimizer.momentum)
        else:
            logging.debug("lr: %.7f - momentum: -", self.optimizer.lr)

        # the metric values have already been computed during training for the train set
        self._fill_train_metrics()

        self.val_metric_values = evaluate_metrics(
            self.to.metrics,
            self._val_iter,
            self._model,
        )
        if self.use_rtpt:
            # update process title according to loss
            self.rtpt.step(
                subtitle=f"loss={self.val_metric_values['loss']:2.2f}")
        if self.tc.use_spike_recovery and (
                self.old_val_loss * self.tc.spike_thresh <
                self.val_metric_values["loss"] or np.isnan(
                    self.val_metric_values["loss"])):  # check for spikes
            self.handle_spike()
        else:
            self.update_eval()

    def handle_spike(self):
        """
        Handles the occurrence of a spike during training, i.e. the case where the validation loss increased dramatically.
        :return: self._return_metrics_and_stop_training()
        """
        self.nb_spikes += 1
        logging.warning(
            "Spike %d/%d occurred - val_loss: %.3f",
            self.nb_spikes,
            self.tc.max_spikes,
            self.val_metric_values["loss"],
        )
        if self.nb_spikes >= self.tc.max_spikes:
            val_loss = self.val_metric_values["loss"]
            val_p_acc = self.val_metric_values["policy_acc"]
            # finally stop training because the number of lr drops has been achieved

            logging.debug(
                "The maximum number of spikes has been reached. Stop training."
            )
            self.continue_training = False

            if self.tc.log_metrics_to_tensorboard:
                self.sum_writer.close()
            return return_metrics_and_stop_training(
                self.k_steps, self.val_metric_values, self.k_steps_best,
                self.val_metric_values_best)

        logging.debug("Recover to latest checkpoint")
        # Load the best model once again
        prefix = self.tc.export_dir + "weights/model-%.5f-%.3f" % (
            self.val_loss_best, self.val_p_acc_best)

        logging.debug("load current best model:%s", prefix)
        self._model.load(prefix, epoch=self.k_steps_best)

        self.k_steps = self.k_steps_best
        logging.debug("k_step is back at %d", self.k_steps_best)
        # print the elapsed time
        t_delta = time() - self.t_s_steps
        print(" - %.ds" % t_delta)
        self.t_s_steps = time()

    def update_eval(self):
        """
        Updates the evaluation metrics
        :return:
        """
        # update the val_loss_value to compare with using spike recovery
        self.old_val_loss = self.val_metric_values["loss"]
        # log the metric values to tensorboard
        self._log_metrics(self.train_metric_values,
                          global_step=self.k_steps,
                          prefix="train_")
        self._log_metrics(self.val_metric_values,
                          global_step=self.k_steps,
                          prefix="val_")

        # check if a new checkpoint shall be created
        if self.val_loss_best is None or self.val_metric_values[
                "loss"] < self.val_loss_best:
            # update val_loss_best
            self.val_loss_best = self.val_metric_values["loss"]
            self.val_p_acc_best = self.val_metric_values["policy_acc"]
            self.val_metric_values_best = self.val_metric_values
            self.k_steps_best = self.k_steps

            if self.tc.export_weights:
                prefix = self.tc.export_dir + "weights/model-%.5f-%.3f" % (
                    self.val_loss_best, self.val_p_acc_best)
                # the export function saves both the architecture and the weights
                print()
                self._model.save_checkpoint(prefix, epoch=self.k_steps_best)

            self.patience_cnt = 0  # reset the patience counter
        # print the elapsed time
        t_delta = time() - self.t_s_steps
        print(" - %.ds" % t_delta)
        self.t_s_steps = time()

        # log the samples per second metric to tensorboard
        self.sum_writer.add_scalar(
            tag="samples_per_second",
            value={
                "hybrid_sync":
                self.tc.batch_size * self.tc.batch_steps / t_delta
            },
            global_step=self.k_steps,
        )

        # log the current learning rate
        self.sum_writer.add_scalar(tag="lr",
                                   value=self.to.lr_schedule(self.cur_it),
                                   global_step=self.k_steps)
        if self.tc.optimizer_name == "nag":
            # log the current momentum value
            self.sum_writer.add_scalar(tag="momentum",
                                       value=self.to.momentum_schedule(
                                           self.cur_it),
                                       global_step=self.k_steps)

        if self.cur_it >= self.tc.total_it:

            self.continue_training = False

            self.val_loss = self.val_metric_values["loss"]
            self.val_p_acc = self.val_metric_values["policy_acc"]
            # finally stop training because the number of lr drops has been achieved
            logging.debug("The number of given iterations has been reached")

            if self.tc.log_metrics_to_tensorboard:
                self.sum_writer.close()

    def batch_callback(self):
        """
        Callback which is executed after every batch to update the momentum and learning rate
        :return:
        """

        # update the learning rate and momentum
        self.optimizer.lr = self.to.lr_schedule(self.cur_it)
        if self.tc.optimizer_name == "nag":
            self.optimizer.momentum = self.to.momentum_schedule(self.cur_it)

        self.cur_it += 1
        self.batch_proc_tmp += 1

        if self.batch_proc_tmp >= self.tc.batch_steps:  # show metrics every batch_steps updates
            self.batch_proc_tmp = self.batch_proc_tmp - self.tc.batch_steps
            # update the counters
            self.k_steps += 1
            self.patience_cnt += 1
            self.recompute_eval()
            self.custom_metric_eval()

    def custom_metric_eval(self):
        """
        Evaluates the model based on the validation set of different variants
        """

        if self.to.variant_metrics is None:
            return

        for part_id, variant_name in enumerate(self.to.variant_metrics):
            # load one chunk of the dataset from memory
            _, x_val, yv_val, yp_val, _, _ = load_pgn_dataset(
                dataset_type="val",
                part_id=part_id,
                normalize=self.tc.normalize,
                verbose=False,
                q_value_ratio=self.tc.q_value_ratio)

            if self.tc.select_policy_from_plane:
                val_iter = mx.io.NDArrayIter({'data': x_val}, {
                    'value_label':
                    yv_val,
                    'policy_label':
                    np.array(FLAT_PLANE_IDX)[yp_val.argmax(axis=1)]
                }, self.tc.batch_size)
            else:
                val_iter = mx.io.NDArrayIter(
                    {'data': x_val}, {
                        'value_label': yv_val,
                        'policy_label': yp_val.argmax(axis=1)
                    }, self.tc.batch_size)

            results = self._model.score(val_iter, self.to.metrics)
            prefix = "val_"

            for entry in results:
                name = variant_name + "_" + entry[0]
                value = entry[1]
                print(" - %s%s: %.4f" % (prefix, name, value), end="")
                # add the metrics to the tensorboard event file
                if self.tc.log_metrics_to_tensorboard:
                    self.sum_writer.add_scalar(
                        name, [prefix.replace("_", ""), value], self.k_steps)
        print()
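
The _log_metrics helper above passes a [name, value] pair as the add_scalar value, which presumably relies on a SummaryWriter build that can group several curves under one tag. With the stock mxboard API, the portable equivalent is one float per prefixed tag, logged at the same global step so the train and val curves line up. A minimal sketch, with sum_writer, the metric dictionaries and k_steps assumed as above:

        for name, value in train_metric_values.items():
            sum_writer.add_scalar(tag='train_' + name, value=value, global_step=k_steps)
        for name, value in val_metric_values.items():
            sum_writer.add_scalar(tag='val_' + name, value=value, global_step=k_steps)
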
Ejemplo n.º 24
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_size=16,
        batch_log=100,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        topk=100,
        plot_class_thresh=0.5):
    '''
    AMP does not support every operation.
    It does not support modulated convolution.
    '''
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # check the operating system
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3) + tuple(input_size)

    scale_factor = 4  # fixed
    logging.info(f"scale factor {scale_factor}")

    try:
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)

    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base)
    else:
        model = str(input_size[0]) + "_" + str(
            input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = CenterNet(base=base,
                        heads=OrderedDict([('heatmap', {
                            'num_output': num_classes,
                            'bias': -2.19
                        }), ('offset', {
                            'num_output': 2
                        }), ('wh', {
                            'num_output': 2
                        })]),
                        head_conv_channel=64,
                        pretrained=pretrained_base,
                        root=pretrained_path,
                        use_dcnv2=False,
                        ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "beta1": 0.9,
                    "beta2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "gamma1": 0.9,
                    "gamma2": 0.999,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(
                net.collect_params(),
                optimizer,
                optimizer_params={
                    "learning_rate": learning_rate,
                    "lr_scheduler": lr_sch,
                    "wd": 0.0001,
                    "momentum": 0.9,
                    'multi_precision': False
                },
                update_on_kvstore=False)  # for Dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)

        amp.init_trainer(trainer)

    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "beta1": 0.9,
                                        "beta2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "gamma1": 0.9,
                                        "gamma2": 0.999,
                                        'multi_precision': False
                                    })
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={
                                        "learning_rate": learning_rate,
                                        "lr_scheduler": lr_sch,
                                        "wd": 0.0001,
                                        "momentum": 0.9,
                                        'multi_precision': False
                                    })

        else:
            logging.error("optimizer not selected")
            exit(0)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size,
                            topk=topk,
                            scale=scale_factor)
    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()
        '''
        Generating the targets inside train_dataloader is much faster for training.
        '''

        for batch_count, (image, _, heatmap, offset_target, wh_target,
                          mask_target, _) in enumerate(train_dataloader,
                                                       start=1):
            td_batch_size = image.shape[0]

            image_split = mx.nd.split(data=image,
                                      num_outputs=subdivision,
                                      axis=0)
            heatmap_split = mx.nd.split(data=heatmap,
                                        num_outputs=subdivision,
                                        axis=0)
            offset_target_split = mx.nd.split(data=offset_target,
                                              num_outputs=subdivision,
                                              axis=0)
            wh_target_split = mx.nd.split(data=wh_target,
                                          num_outputs=subdivision,
                                          axis=0)
            mask_target_split = mx.nd.split(data=mask_target,
                                            num_outputs=subdivision,
                                            axis=0)

            if subdivision == 1:
                image_split = [image_split]
                heatmap_split = [heatmap_split]
                offset_target_split = [offset_target_split]
                wh_target_split = [wh_target_split]
                mask_target_split = [mask_target_split]
            '''
            autograd explanation:
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):

                heatmap_all_losses = []
                offset_all_losses = []
                wh_all_losses = []

                for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip(
                        image_split, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):

                    if GPU_COUNT <= 1:
                        image_part = gluon.utils.split_and_load(
                            image_part, [ctx], even_split=False)
                        heatmap_part = gluon.utils.split_and_load(
                            heatmap_part, [ctx], even_split=False)
                        offset_target_part = gluon.utils.split_and_load(
                            offset_target_part, [ctx], even_split=False)
                        wh_target_part = gluon.utils.split_and_load(
                            wh_target_part, [ctx], even_split=False)
                        mask_target_part = gluon.utils.split_and_load(
                            mask_target_part, [ctx], even_split=False)
                    else:
                        image_part = gluon.utils.split_and_load(
                            image_part, ctx, even_split=False)
                        heatmap_part = gluon.utils.split_and_load(
                            heatmap_part, ctx, even_split=False)
                        offset_target_part = gluon.utils.split_and_load(
                            offset_target_part, ctx, even_split=False)
                        wh_target_part = gluon.utils.split_and_load(
                            wh_target_part, ctx, even_split=False)
                        mask_target_part = gluon.utils.split_and_load(
                            mask_target_part, ctx, even_split=False)

                    # prediction, target space for Data Parallelism
                    heatmap_losses = []
                    offset_losses = []
                    wh_losses = []
                    total_loss = []

                    # code that handles N GPUs (Data Parallelism)
                    for img, heatmap_target, offset_target, wh_target, mask_target in zip(
                            image_part, heatmap_part, offset_target_part,
                            wh_target_part, mask_target_part):
                        heatmap_pred, offset_pred, wh_pred = net(img)
                        heatmap_loss = heatmapfocalloss(
                            heatmap_pred, heatmap_target)
                        offset_loss = normedl1loss(offset_pred, offset_target,
                                                   mask_target) * lambda_off
                        wh_loss = normedl1loss(wh_pred, wh_target,
                                               mask_target) * lambda_size

                        heatmap_losses.append(heatmap_loss.asscalar())
                        offset_losses.append(offset_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())

                        total_loss.append(heatmap_loss + offset_loss + wh_loss)

                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    heatmap_all_losses.append(sum(heatmap_losses))
                    offset_all_losses.append(sum(offset_losses))
                    wh_all_losses.append(sum(wh_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # clear the accumulated gradients

            for p in net.collect_params().values():
                p.zero_grad()

            heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size
            offset_loss_sum += sum(offset_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]'
                    f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]'
                    f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                            train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum,
                                           train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum,
                                       train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean

        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # compute the validation loss
            for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                    heatmap_split = gluon.utils.split_and_load(
                        heatmap_all, [ctx], even_split=False)
                    offset_target_split = gluon.utils.split_and_load(
                        offset_target_all, [ctx], even_split=False)
                    wh_target_split = gluon.utils.split_and_load(
                        wh_target_all, [ctx], even_split=False)
                    mask_target_split = gluon.utils.split_and_load(
                        mask_target_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)
                    heatmap_split = gluon.utils.split_and_load(
                        heatmap_all, ctx, even_split=False)
                    offset_target_split = gluon.utils.split_and_load(
                        offset_target_all, ctx, even_split=False)
                    wh_target_split = gluon.utils.split_and_load(
                        wh_target_all, ctx, even_split=False)
                    mask_target_split = gluon.utils.split_and_load(
                        mask_target_all, ctx, even_split=False)

                # prediction, target space for Data Parallelism
                heatmap_losses = []
                offset_losses = []
                wh_losses = []

                # code that handles N GPUs (Data Parallelism)
                for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip(
                        image, label, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)

                    id, score, bbox = prediction(heatmap_pred, offset_pred,
                                                 wh_pred)
                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box * scale_factor,
                                            gt_labels=gt_id)

                    heatmap_loss = heatmapfocalloss(heatmap_pred,
                                                    heatmap_target)
                    offset_loss = normedl1loss(offset_pred, offset_target,
                                               mask_target) * lambda_off
                    wh_loss = normedl1loss(wh_pred, wh_target,
                                           mask_target) * lambda_size

                    heatmap_losses.append(heatmap_loss.asscalar())
                    offset_losses.append(offset_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())

                heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size
                offset_loss_sum += sum(offset_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                                valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum,
                                               valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum,
                                           valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)
            precision_recall.reset()

            if tensorboard:
                # code that handles N GPUs (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)

                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx],
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx],
                                                       even_split=False)
                else:
                    image = gluon.utils.split_and_load(image,
                                                       ctx,
                                                       even_split=False)
                    label = gluon.utils.split_and_load(label,
                                                       ctx,
                                                       even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                heatmap_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    ids, scores, bboxes = prediction(heatmap_pred, offset_pred,
                                                     wh_pred)

                    for ig, gt_id, gt_box, heatmap, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, heatmap_pred, ids, scores,
                            bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # draw the heatmap
                        heatmap = mx.nd.multiply(heatmap, 255.0)  # scale to the 0 ~ 255 range
                        heatmap = mx.nd.max(
                            heatmap, axis=0,
                            keepdims=True)  # take the max along the channel axis
                        heatmap = mx.nd.transpose(
                            heatmap,
                            axes=(1, 2, 0))  # (height, width, channel=1)
                        heatmap = mx.nd.repeat(
                            heatmap, repeats=3,
                            axis=-1)  # (height, width, channel=3)
                        heatmap = heatmap.asnumpy(
                        )  # mxnet.ndarray -> numpy.ndarray
                        heatmap = cv2.resize(heatmap,
                                             dsize=(input_size[1],
                                                    input_size[0]))  # restore the original size
                        heatmap = heatmap.astype("uint8")  # float32 -> uint8
                        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                        heatmap[:, :,
                                (0, 1, 2)] = heatmap[:, :,
                                                     (2, 1, 0)]  # BGR -> RGB
                        heatmap = np.transpose(
                            heatmap,
                            axes=(2, 0, 1))  # (channel=3, height, width)

                        # draw the ground-truth boxes
                        ground_truth = plot_bbox(
                            ig,
                            gt_box * scale_factor,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # draw the predicted boxes
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # convert BGR -> RGB and (height, width, channel) -> (channel, height, width) for TensorBoard
                        prediction_box = cv2.cvtColor(prediction_box,
                                                      cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)
                        heatmap_image.append(heatmap)

                all_image = np.concatenate(
                    [np.array(batch_image),
                     np.array(heatmap_image)], axis=-1)
                summary.add_image(tag="valid_result",
                                  image=all_image,
                                  global_step=i)
                summary.add_scalar(tag="heatmap_loss",
                                   value={
                                       "train_heatmap_loss_mean":
                                       train_heatmap_loss_mean,
                                       "valid_heatmap_loss_mean":
                                       valid_heatmap_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="offset_loss",
                                   value={
                                       "train_offset_loss_mean":
                                       train_offset_loss_mean,
                                       "valid_offset_loss_mean":
                                       valid_offset_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="wh_loss",
                                   value={
                                       "train_wh_loss_mean":
                                       train_wh_loss_mean,
                                       "valid_wh_loss_mean": valid_wh_loss_mean
                                   },
                                   global_step=i)

                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            if not os.path.exists(weight_path):
                os.makedirs(weight_path)
            '''
            Hybrid models can be serialized as JSON files using the export function.
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there is only one input, it will be named data. When there are multiple inputs, they will be named data0, data1, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)  # a new object is created
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # for ONNX export
                # handles network inference, decoding and NMS - convenient for the MXNet C++ API
                export_block_for_cplusplus(
                    path=os.path.join(weight_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=True,  # for C++ inference, an image read with OpenCV can be fed in as-is
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
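Ejemplo 24 accumulates gradients over `subdivision` sub-batches before a single trainer.step: grad_req is set to 'add', backward is run per sub-batch, and zero_grad clears the buffers after the update. A minimal sketch of that pattern, assuming a generic Gluon `net`, `loss_fn` and `trainer`, and a batch size divisible by `subdivision` with subdivision >= 2 (nd.split returns a bare NDArray when num_outputs == 1, which Ejemplo 24 special-cases):

from mxnet import autograd, nd

def accumulate_and_step(net, trainer, loss_fn, data, label, subdivision):
    # accumulate gradients across sub-batches instead of overwriting them
    for p in net.collect_params().values():
        if p.grad_req != 'null':
            p.grad_req = 'add'

    data_parts = nd.split(data, num_outputs=subdivision, axis=0)
    label_parts = nd.split(label, num_outputs=subdivision, axis=0)
    for d, l in zip(data_parts, label_parts):
        with autograd.record():
            loss = loss_fn(net(d), l)
        loss.backward()

    # one optimizer update for the whole (virtual) batch
    trainer.step(batch_size=data.shape[0])
    # reset the accumulated gradients before the next batch
    for p in net.collect_params().values():
        p.zero_grad()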
Ejemplo n.º 25
0
def train_net(net, train_iter, valid_iter, batch_size, trainer, ctx,
              num_epochs, lr_sch, save_prefix):
    logger.info("===================START TRAINING====================")
    if use_mxboard:
        sw = SummaryWriter(logdir='logs', flush_secs=5)
    cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
    cls_acc = mx.metric.Accuracy(name="train acc")
    top_acc = 0
    iter_num = 0
    #test_acc,test_loss = test_net(net, valid_iter, ctx)
    #sw.add_graph(net) #only hybrid block supported
    param_names = net.collect_params().keys()
    for epoch in range(num_epochs):
        train_loss = []
        t0 = time.time()
        if isinstance(train_iter, mx.io.MXDataIter):
            train_iter.reset()
        total = 0
        trainer.set_learning_rate(lr_sch(epoch))
        for batch in train_iter:
            iter_num += 1
            # print("iter ",iter_num," start")
            if isinstance(batch, mx.io.DataBatch):
                X, Y = batch.data[0], batch.label[0]
                #total += X.shape[0]
                #print(total)
            else:
                X, Y = batch
            #print(X.shape,Y.shape)
            #print(Y)
            X = X.as_in_context(ctx)
            Y = Y.as_in_context(ctx)
            with autograd.record(True):
                out = net(X)
                #out = out.as_in_context(mx.cpu())
                loss = cls_loss(out, Y)
            # print(out.asnumpy()[0])
            # print('loss = ',loss.sum().asscalar())
            loss.backward()
            train_loss.append(loss.sum().asscalar())
            trainer.step(batch_size)
            cls_acc.update(Y, out)
            nd.waitall()
            #print("iter ",iter_num," end")
            if use_mxboard:
                if iter_num % 100 == 0:
                    sw.add_scalar(tag='train_loss',
                                  value=loss.mean().asscalar(),
                                  global_step=iter_num)
                    sw.add_scalar(tag='train_acc',
                                  value=cls_acc.get(),
                                  global_step=iter_num)
                if iter_num % 100 == 0:
                    for name in net.collect_params():
                        param = net.collect_params()[name]
                        if param.grad_req != "null":
                            sw.add_histogram(tag=name,
                                             values=param.grad(),
                                             global_step=iter_num,
                                             bins=1000)

        logger.info("epoch {} lr {} {}sec".format(epoch, trainer.learning_rate,
                                                  time.time() - t0))
        train_loss, train_acc = np.mean(train_loss) / batch_size, cls_acc.get()
        logger.info("\ttrain loss {} {}".format(train_loss, train_acc))
        if epoch > 0 and (epoch % 10) == 0:
            test_acc, test_loss = test_net(net, valid_iter, ctx)
            if use_mxboard:
                sw.add_scalar(tag='test_acc',
                              value=test_acc,
                              global_step=epoch)
                sw.add_scalar(tag='test_loss',
                              value=test_loss,
                              global_step=epoch)
            if top_acc < test_acc:
                top_acc = test_acc
                logger.info('\ttop valid acc {}'.format(test_acc))
                if isinstance(net, mx.gluon.nn.HybridSequential) or isinstance(
                        net, mx.gluon.nn.HybridBlock):
                    pf = '{}_{:.3f}.params'.format(save_prefix, top_acc)
                    net.export(pf, epoch)
                else:
                    net_path = '{}top_acc_{}_{:.3f}.params'.format(
                        save_prefix, epoch, top_acc)
                    net.save_parameters(net_path)

    if use_mxboard:
        sw.close()
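The gradient-histogram logging in Ejemplo 25 can be factored into a small helper. A sketch assuming `net` is a Gluon block held on a single context and `sw` is an open SummaryWriter; the helper name and defaults are illustrative:

def log_gradient_histograms(net, sw, iter_num, every=100, bins=1000):
    """Log the gradient of every trainable parameter every `every` iterations."""
    if iter_num % every != 0:
        return
    for name, param in net.collect_params().items():
        if param.grad_req != 'null':
            sw.add_histogram(tag=name,
                             values=param.grad(),
                             global_step=iter_num,
                             bins=bins)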
Ejemplo n.º 26
0
def mytrain(net,num_classes,train_data,valid_data,ctx,start_epoch, end_epoch, \
            arm_cls_loss=arm_cls_loss,cls_loss=cls_loss,box_loss=box_loss,trainer=None):
    if trainer is None:
        # trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01,'momentum':0.9, 'wd':50.0})
        trainer = gluon.Trainer(net.collect_params(), 'adam', {
            'learning_rate': 0.001,
            'clip_gradient': 2.0
        })
        # trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.003})
    box_metric = metric.MAE()

    ## add visible
    # collect parameter names for logging the gradients of parameters in each epoch
    params = net.collect_params()
    # param_names = params.keys()
    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='./logs', flush_secs=5)
    global_step = 0

    for e in range(start_epoch, end_epoch):
        # print(e)
        train_data.reset()
        valid_data.reset()
        box_metric.reset()
        tic = time.time()
        _loss = [0, 0]
        arm_loss = [0, 0]
        # if e == 6 or e == 100:
        #     trainer.set_learning_rate(trainer.learning_rate * 0.2)

        outs, labels = None, None
        for i, batch in enumerate(train_data):
            # print('----- batch {} start ----'.format(i))
            data = batch.data[0].as_in_context(ctx)
            label = batch.label[0].as_in_context(ctx)
            # print('label shape: ',label.shape)
            with autograd.record():
                # 1. generate results according to extract network
                ssd_layers = net(data)
                arm_loc_preds, arm_cls_preds, arm_anchor_boxes, odm_loc_preds, odm_cls_preds = multibox_layer(ssd_layers,\
                                                                            num_classes,sizes,ratios,normalizations)
                # arm_loc_preds, arm_cls_preds, arm_anchor_boxes, odm_loc_preds, odm_cls_preds = net(data)
                # print('---------1111-----------')
                # 2. ARM predict
                ## 2.1  modify label as [-1,0,..]
                label_arm = nd.Custom(label, op_type='modify_label')
                arm_tmp = MultiBoxTarget(arm_anchor_boxes,label_arm,arm_cls_preds,overlap_threshold=.5,\
                                         negative_mining_ratio=3,negative_mining_thresh=.5)
                arm_loc_target = arm_tmp[0]  # box offset
                arm_loc_target_mask = arm_tmp[1]  # box mask (only 0,1)
                arm_cls_target = arm_tmp[2]  # class index of every anchor
                # print(sum(arm_cls_target[0]))
                # print('---------2222-----------')

                # 3. ODM predict
                ## 3.1 refine anchor generator originate in ARM
                odm_anchor_boxes = refine_anchor_generator(
                    arm_anchor_boxes,
                    arm_loc_preds)  #(batch,h*w*num_anchors[:layers],4)
                # ### debug backward err
                # odm_anchor_boxes = arm_anchor_boxes
                odm_anchor_boxes_bs = nd.split(
                    data=odm_anchor_boxes, axis=0,
                    num_outputs=label.shape[0])  # list
                # print('---3 : odm_anchor_boxes_bs shape : {}'.format(odm_anchor_boxes_bs[0].shape))
                # print('---------3333-----------')
                ## 3.2 compute the Target for all data in the current batch (for multi-GPU use)

                odm_loc_target = []
                odm_loc_target_mask = []
                odm_cls_target = []
                label_bs = nd.split(data=label,
                                    axis=0,
                                    num_outputs=label.shape[0])
                odm_cls_preds_bs = nd.split(data=odm_cls_preds,
                                            axis=0,
                                            num_outputs=label.shape[0])
                # print('---4 : odm_cls_preds_bs shape: {}'.format(odm_cls_preds_bs[0].shape))
                # print('---4 : label_bs shape: {}'.format(label_bs[0].shape))

                for j in range(label.shape[0]):
                    if label.shape[0] == 1:
                        odm_tmp = MultiBoxTarget(odm_anchor_boxes_bs[j].expand_dims(axis=0),label_bs[j].expand_dims(axis=0),\
                                            odm_cls_preds_bs[j].expand_dims(axis=0),overlap_threshold=.5,negative_mining_ratio=2,negative_mining_thresh=.5)
                    ## batch size larger than 1
                    else:
                        odm_tmp = MultiBoxTarget(odm_anchor_boxes_bs[j],label_bs[j],\
                                            odm_cls_preds_bs[j],overlap_threshold=.5,negative_mining_ratio=3,negative_mining_thresh=.5)
                    odm_loc_target.append(odm_tmp[0])
                    odm_loc_target_mask.append(odm_tmp[1])
                    odm_cls_target.append(odm_tmp[2])
                ### concat; the targets above were computed per image because odm covers the whole batch, so it had to be split per image
                odm_loc_target = nd.concat(*odm_loc_target, dim=0)
                odm_loc_target_mask = nd.concat(*odm_loc_target_mask, dim=0)
                odm_cls_target = nd.concat(*odm_cls_target, dim=0)

                # 4. negative filtering
                group = nd.Custom(arm_cls_preds,
                                  odm_cls_target,
                                  odm_loc_target_mask,
                                  op_type='negative_filtering')
                odm_cls_target = group[0]  # odm_cls filtered by the ARM classification scores
                odm_loc_target_mask = group[1]  # filtered-out entries have mask 0
                # print('---------4444-----------')
                # 5. calc loss
                # TODO:add 1/N_arm, 1/N_odm (num of positive anchors)
                # arm_cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
                arm_loss_cls = arm_cls_loss(arm_cls_preds.transpose((0, 2, 1)),
                                            arm_cls_target)
                arm_loss_loc = box_loss(arm_loc_preds, arm_loc_target,
                                        arm_loc_target_mask)
                # print('55555 loss->  arm_loss_cls : {} arm_loss_loc {}'.format(arm_loss_cls.shape,arm_loss_loc.shape))
                # print('arm_loss_cls loss : {}'.format(arm_loss_cls))
                # odm_cls_prob = nd.softmax(odm_cls_preds,axis=2)
                tmp = odm_cls_preds.transpose((0, 2, 1))
                odm_loss_cls = cls_loss(odm_cls_preds.transpose((0, 2, 1)),
                                        odm_cls_target)
                odm_loss_loc = box_loss(odm_loc_preds, odm_loc_target,
                                        odm_loc_target_mask)
                # print('66666 loss->  odm_loss_cls : {} odm_loss_loc {}'.format(odm_loss_cls.shape,odm_loss_loc.shape))
                # print('odm_loss_cls loss :{} '.format(odm_loss_cls))
                # print('odm_loss_loc loss :{} '.format(odm_loss_loc))
                # print('N_arm: {} ; N_odm: {} '.format(nd.sum(arm_loc_target_mask,axis=1)/4.0,nd.sum(odm_loc_target_mask,axis=1)/4.0))
                # loss = arm_loss_cls+arm_loss_loc+odm_loss_cls+odm_loss_loc
                loss = 1/(nd.sum(arm_loc_target_mask,axis=1)/4.0) *(arm_loss_cls+arm_loss_loc) + \
                        1/(nd.sum(odm_loc_target_mask,axis=1)/4.0)*(odm_loss_cls+odm_loss_loc)

            sw.add_scalar(tag='loss',
                          value=loss.mean().asscalar(),
                          global_step=global_step)
            global_step += 1
            loss.backward(retain_graph=False)
            # autograd.backward(loss)
            # print(net.collect_params().get('conv4_3_weight').data())
            # print(net.collect_params().get('vgg0_conv9_weight').grad())
            ### test the gradients separately
            # arm_loss_cls.backward(retain_graph=False)
            # arm_loss_loc.backward(retain_graph=False)
            # odm_loss_cls.backward(retain_graph=False)
            # odm_loss_loc.backward(retain_graph=False)

            trainer.step(data.shape[0])
            _loss[0] += nd.mean(odm_loss_cls).asscalar()
            _loss[1] += nd.mean(odm_loss_loc).asscalar()
            arm_loss[0] += nd.mean(arm_loss_cls).asscalar()
            arm_loss[1] += nd.mean(arm_loss_loc).asscalar()
            # print(arm_loss)
            arm_cls_prob = nd.SoftmaxActivation(arm_cls_preds, mode='channel')
            odm_cls_prob = nd.SoftmaxActivation(odm_cls_preds, mode='channel')
            out = MultiBoxDetection(odm_cls_prob,odm_loc_preds,odm_anchor_boxes,\
                                        force_suppress=True,clip=False,nms_threshold=.5,nms_topk=400)
            # print('out shape: {}'.format(out.shape))
            if outs is None:
                outs = out
                labels = label
            else:
                outs = nd.concat(outs, out, dim=0)
                labels = nd.concat(labels, label, dim=0)
            box_metric.update([odm_loc_target],
                              [odm_loc_preds * odm_loc_target_mask])
        print('-------{} epoch end ------'.format(e))
        train_AP = evaluate_MAP(outs, labels)
        valid_AP, val_box_metric = evaluate_acc(net, valid_data, ctx)
        info["train_ap"].append(train_AP)
        info["valid_ap"].append(valid_AP)
        info["loss"].append(_loss)
        print('odm loss: ', _loss)
        print('arm loss: ', arm_loss)
        if e == 0:
            sw.add_graph(net)
        # grads = [i.grad() for i in net.collect_params().values()]
        # grads_4_3 = net.collect_params().get('vgg0_conv9_weight').grad()
        # sw.add_histogram(tag ='vgg0_conv9_weight',values=grads_4_3,global_step=e, bins=1000 )
        grads_4_2 = net.collect_params().get('vgg0_conv5_weight').grad()
        sw.add_histogram(tag='vgg0_conv5_weight',
                         values=grads_4_2,
                         global_step=e,
                         bins=1000)
        # assert len(grads) == len(param_names)
        # logging the gradients of parameters for checking convergence
        # for i, name in enumerate(param_names):
        #     sw.add_histogram(tag=name, values=grads[i], global_step=e, bins=1000)

        # net.export('./Model/RefineDet_MeterDetect') # net
        if (e + 1) % 5 == 0:
            print(
                "epoch: %d time: %.2f cls loss: %.4f,reg loss: %.4f lr: %.5f" %
                (e, time.time() - tic, _loss[0], _loss[1],
                 trainer.learning_rate))
            print("train mae: %.4f AP: %.4f" % (box_metric.get()[1], train_AP))
            print("valid mae: %.4f AP: %.4f" %
                  (val_box_metric.get()[1], valid_AP))
        sw.add_scalar(tag='train_AP', value=train_AP, global_step=e)
        sw.add_scalar(tag='valid_AP', value=valid_AP, global_step=e)
    sw.close()
    if True:
        info["loss"] = np.array(info["loss"])
        info["cls_loss"] = info["loss"][:, 0]
        info["box_loss"] = info["loss"][:, 1]

        plt.figure(figsize=(12, 4))
        plt.subplot(121)
        plot("train_ap")
        plot("valid_ap")
        plt.legend(loc="upper right")
        plt.subplot(122)
        plot("cls_loss")
        plot("box_loss")
        plt.legend(loc="upper right")
        plt.savefig('loss_curve.png')
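As the examples note (Ejemplo 25 comments that only hybrid blocks are supported, and Ejemplo 24 runs a forward pass first), SummaryWriter.add_graph needs a hybridized network whose symbolic graph already exists. A minimal, self-contained sketch with a toy network whose architecture is illustrative only:

import mxnet as mx
from mxnet.gluon import nn
from mxboard import SummaryWriter

net = nn.HybridSequential()
net.add(nn.Dense(16, activation='relu'), nn.Dense(2))
net.initialize()
net.hybridize()
net(mx.nd.ones((1, 8)))   # one forward pass builds the cached symbolic graph

sw = SummaryWriter(logdir='./logs', verbose=False)
sw.add_graph(net)
sw.close()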
Ejemplo n.º 27
0
def train():
    # load_data
    batch_size = args.batch_size * max(args.num_gpus, 1)
    train_set = gluon.data.vision.CIFAR10(train=True,
                                          transform=transform_train)
    train_data = DataLoader(train_set,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=args.num_workers,
                            last_batch='discard')
    val_set = gluon.data.vision.CIFAR10(train=False, transform=transform_val)
    val_data = DataLoader(val_set,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=args.num_workers)

    # set the network and trainer
    ctx = [mx.gpu(i)
           for i in range(args.num_gpus)] if args.num_gpus > 0 else [mx.cpu()]
    net = get_attention_cifar(10, num_layers=args.num_layers)
    net.initialize(init=mx.initializer.MSRAPrelu(), ctx=ctx)
    net.hybridize()

    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': args.lr,
        'momentum': args.momentum,
        'wd': args.wd
    })
    cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss(
        sparse_label=not use_mix_up)
    train_metric = mtc.Accuracy() if not use_mix_up else mx.metric.RMSE()

    # set log output
    train_mode = 'MixUP' if use_mix_up else 'Vanilla'
    logger = logging.getLogger('TRAIN')
    logger.setLevel("INFO")
    logger.addHandler(logging.StreamHandler())
    logger.addHandler(
        logging.FileHandler(
            os.path.join(
                args.log_dir, 'text/cifar10_attention%d_%s_%s.log' %
                (args.num_layers, train_mode,
                 datetime.strftime(datetime.now(), '%Y%m%d%H%M')))))
    sw = SummaryWriter(logdir=os.path.join(
        args.log_dir, 'board/cifar10_attention%d_%s_%s' %
        (args.num_layers, train_mode,
         datetime.strftime(datetime.now(), '%Y%m%d%H%M'))),
                       verbose=False)

    # record the training hyper parameters
    logger.info(args)
    lr_counter = 0
    lr_steps = [int(s) for s in args.lr_steps.strip().split(',')]
    num_batch = len(train_data)
    epochs = args.epochs + 1
    alpha = args.alpha
    max_accuracy = 0.9

    for epoch in range(epochs):
        if epoch == lr_steps[lr_counter]:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
            if lr_counter + 1 < len(lr_steps):
                lr_counter += 1
        train_loss = 0
        train_metric.reset()
        tic = time.time()
        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0,
                                              even_split=False)
            labels = gluon.utils.split_and_load(batch[1],
                                                ctx_list=ctx,
                                                batch_axis=0,
                                                even_split=False)

            if use_mix_up and epoch < epochs - 20:
                lam = np.random.beta(alpha, alpha)
                data = [lam * X + (1 - lam) * X[::-1] for X in data]
                labels = [lam * Y + (1 - lam) * Y[::-1] for Y in labels]

            with ag.record():
                outputs = [net(X) for X in data]
                losses = [
                    cross_entropy(yhat, y) for yhat, y in zip(outputs, labels)
                ]

            for l in losses:
                ag.backward(l)

            trainer.step(batch_size)
            train_metric.update(labels, outputs)

            train_loss += sum([l.mean().asscalar()
                               for l in losses]) / len(losses)

        _, train_acc = train_metric.get()
        train_loss /= num_batch
        val_acc, val_loss = validate(net, val_data, ctx)

        sw.add_scalar("AttentionNet/Loss", {
            'train': train_loss,
            'val': val_loss
        }, epoch)
        sw.add_scalar("AttentionNet/Metric", {
            'train': train_acc,
            'val': val_acc
        }, epoch)
        logger.info('[Epoch %d] train metric: %.6f, train loss: %.6f | '
                    'val accuracy: %.6f, val loss: %.6f, time: %.1f' %
                    (epoch, train_acc, train_loss, val_acc, val_loss,
                     time.time() - tic))

        if (epoch % args.save_period) == 0 and epoch != 0:
            net.save_parameters(
                "./models/attention%d-cifar10-epoch-%d-%s.params" %
                (args.num_layers, epoch, train_mode))

        if val_acc > max_accuracy:
            net.save_parameters(
                "./models/best-%f-attention%d-cifar10-epoch-%d-%s.params" %
                (val_acc, args.num_layers, epoch, train_mode))
            max_accuracy = val_acc

    sw.close()
    logger.info("Train End.")
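Ejemplo 27 blends each batch with its reversed copy using a Beta-distributed coefficient (mixup), which is why the loss is built with sparse_label=False and one-hot labels. A minimal sketch of that blending step, mirroring the example's reversed-batch trick; the shapes and alpha are illustrative:

import numpy as np
import mxnet as mx

def mixup_batch(data, one_hot_labels, alpha=1.0):
    # blend every sample with the sample at the mirrored position in the batch
    lam = np.random.beta(alpha, alpha)
    mixed_data = lam * data + (1 - lam) * data[::-1]
    mixed_labels = lam * one_hot_labels + (1 - lam) * one_hot_labels[::-1]
    return mixed_data, mixed_labels

x = mx.nd.random.uniform(shape=(4, 3, 32, 32))
y = mx.nd.one_hot(mx.nd.array([0, 1, 2, 3]), depth=10)
mixed_x, mixed_y = mixup_batch(x, y)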
Ejemplo n.º 28
0
                loss = loss_fun(y_hat, label_i)
            loss.backward()
            trainer.step(batch_size=batch_size)
            train_loss += loss.mean().asscalar()
            f_train_acc += acc(y_hat, label_i)

        for batch_data in test_data:
            data = batch_data[0].as_in_context(ctx)
            label = batch_data[1].as_in_context(ctx)
            f_val_acc += acc(mnist_net(data), label)

        f_train_acc = f_train_acc * ninv_train
        f_val_acc = f_val_acc * ninv_test

        sw.add_scalar(tag="RMNIST-07_training_accuracy",
                      value=f_train_acc,
                      global_step=a_epoch)
        sw.add_scalar(tag="MNIST-07_validation_accuracy",
                      value=f_val_acc,
                      global_step=a_epoch)
        #
        print(
            "Epoch %d: Loss: %.3f, Train acc %.3f, Test acc %.3f, Time %.1f sec"
            % (a_epoch, train_loss / len(train_data), f_train_acc, f_val_acc,
               time() - tic))

# requires either full layer specification or passing one batch of data through the net
# so that deferred initialization can work its magic

no_in_channel = 50
no_out_channel = 100
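The note above refers to Gluon's deferred initialization: layers declared without their input shape only allocate parameters once real data flows through them. A sketch that reuses the no_in_channel / no_out_channel names from the fragment above (how they are used there is an assumption; the toy architecture and input shape are illustrative only):

import mxnet as mx
from mxnet.gluon import nn

net = nn.Sequential()
net.add(nn.Conv2D(channels=no_out_channel, kernel_size=3, activation='relu'),
        nn.Dense(10))
net.initialize()                                # parameters not allocated yet
net(mx.nd.ones((1, no_in_channel, 28, 28)))     # first forward pass infers the shapes
print(net)                                      # shapes are now fully specified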
Ejemplo n.º 29
0
def train():
    """training"""
    image_pool = ImagePool(pool_size)
    metric = mx.metric.CustomMetric(facc)

    stamp = datetime.now().strftime('%Y_%m_%d-%H_%M')
    logging.basicConfig(level=logging.DEBUG)

    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='%s' % dir_out_sw, flush_secs=5, verbose=False)
    global_step = 0

    for epoch in range(epochs):
        if epoch == 0:
            netG.hybridize()
            netD.hybridize()
        #     sw.add_graph(netG)
        #     sw.add_graph(netD)

        tic = time.time()
        btic = time.time()
        train_data.reset()
        val_data.reset()
        iter = 0
        for local_step, batch in enumerate(train_data):
            ############################
            # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z)))
            ###########################
            tmp = mx.nd.concat(batch.data[0],
                               batch.data[1],
                               batch.data[2],
                               dim=1)
            tmp = augmenter(tmp,
                            patch_size=128,
                            offset=offset,
                            aug_type=1,
                            aug_methods=aug_methods,
                            random_crop=False)
            real_in = tmp[:, :1].as_in_context(ctx)
            real_out = tmp[:, 1:2].as_in_context(ctx)
            m = tmp[:, 2:3].as_in_context(ctx)  # mask

            fake_out = netG(real_in) * m

            # loss weight based on mask, applied on L1 loss
            if no_loss_weights:
                loss_weight = m
            else:
                loss_weight = m.asnumpy()
                loss_weight[loss_weight == 0] = .1
                loss_weight = mx.nd.array(loss_weight, ctx=m.context)

            fake_concat = image_pool.query(nd.concat(real_in, fake_out, dim=1))
            with autograd.record():
                # Train with fake image
                # Use image pooling to utilize history images
                output = netD(fake_concat)
                fake_label = nd.zeros(output.shape, ctx=ctx)
                errD_fake = GAN_loss(output, fake_label)
                metric.update([
                    fake_label,
                ], [
                    output,
                ])

                # Train with real image
                real_concat = nd.concat(real_in, real_out, dim=1)
                output = netD(real_concat)
                real_label = nd.ones(output.shape, ctx=ctx)
                errD_real = GAN_loss(output, real_label)
                errD = (errD_real + errD_fake) * 0.5
                errD.backward()
                metric.update([
                    real_label,
                ], [
                    output,
                ])

            trainerD.step(batch.data[0].shape[0])

            ############################
            # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z))
            ###########################
            with autograd.record():
                fake_out = netG(real_in)
                fake_concat = nd.concat(real_in, fake_out, dim=1)
                output = netD(fake_concat)
                real_label = nd.ones(output.shape, ctx=ctx)
                errG = GAN_loss(output, real_label) + loss_2nd(
                    real_out, fake_out, loss_weight) * lambda1
                errG.backward()

            trainerG.step(batch.data[0].shape[0])

            sw.add_scalar(tag='loss',
                          value=('d_loss', errD.mean().asscalar()),
                          global_step=global_step)
            sw.add_scalar(tag='loss',
                          value=('g_loss', errG.mean().asscalar()),
                          global_step=global_step)
            global_step += 1

            if epoch + local_step == 0:
                sw.add_graph(netG)
                img_in_list, img_out_list, m_val = val_data.next().data
                m_val = m_val.as_in_context(ctx)
                sw.add_image('first_minibatch_train_real', norm3(real_out))
                sw.add_image('first_minibatch_val_real',
                             norm3(img_out_list.as_in_context(ctx)))
                netG.export('%snetG' % dir_out_checkpoints)
            if local_step == 0:
                # Log the first batch of images of each epoch (training)
                sw.add_image('first_minibatch_train_fake',
                             norm3(fake_out * m) * m, epoch)
                sw.add_image(
                    'first_minibatch_val_fake',
                    norm3(netG(img_in_list.as_in_context(ctx)) * m_val) *
                    m_val, epoch)
                # norm3(netG(img_in_list.as_in_context(ctx)) * m_val.as_in_context(ctx)), epoch)

            if (iter + 1) % 10 == 0:
                name, acc = metric.get()

                logging.info('speed: {} samples/s'.format(
                    batch_size / (time.time() - btic)))
                logging.info(
                    'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d'
                    % (nd.mean(errD).asscalar(), nd.mean(errG).asscalar(), acc,
                       iter, epoch))

            iter += 1
            btic = time.time()

        name, acc = metric.get()
        metric.reset()

        sw.add_scalar(tag='binary_training_acc',
                      value=('acc', acc),
                      global_step=epoch)

        fake_val = netG(val_data.data[0][1].as_in_context(ctx))
        loss_val = loss_2nd(val_data.data[1][1].as_in_context(ctx), fake_val,
                            val_data.data[2][1].as_in_context(ctx)) * lambda1
        sw.add_scalar(tag='loss_val',
                      value=('g_loss', loss_val.mean().asscalar()),
                      global_step=epoch)

        if (epoch % check_point_interval == 0) or (epoch == epochs - 1):
            netD.save_params('%snetD-%04d' % (dir_out_checkpoints, epoch))
            netG.save_params('%snetG-%04d' % (dir_out_checkpoints, epoch))

        logging.info('\nbinary training acc at epoch %d: %s=%f' %
                     (epoch, name, acc))
        logging.info('time: %f' % (time.time() - tic))

    sw.export_scalars('scalar_dict.json')
    sw.close()
Ejemplo n.º 30
0
class Segmentation(object):
    def __init__(self, args):

        self.args = args
        self.lr = 0.0005
        self.device = [
            mx.gpu(0)
        ]  #([mx.gpu(0), mx.gpu(1)] if not args.cpu else [mx.cpu(0)]) if args.train else [mx.gpu(0)]
        self.test_device = [mx.gpu(0)]
        self.lr_decay_step = 5000
        # self.lr_decay_epoch = 2
        self.lr_decay_rate_epoch = 0.9
        self.lr_decay_rate = 0.9
        self.save_freq = 1  # epoch
        self.save_freq_step = 5000
        self.arch_name = config['arch_name']
        self.arch_path = os.path.join('model', self.arch_name)
        self.mode = 'train' if self.args.train else 'test'
        self.batch_size = config[self.mode]['batch_size']
        self.config = config
        self.segmentator = Segmentator(self.args, self.config)
        self.cgeddata = CGEDData(self.segmentator.tokenizer,
                                 self.segmentator.transformer,
                                 self.config,
                                 self.mode,
                                 useDecoder=self.args.decoder,
                                 args=args)

        self.csc15data = SighanCSC15Data(self.segmentator.tokenizer,
                                         self.segmentator.transformer,
                                         self.config,
                                         self.mode,
                                         args=args,
                                         vocab_tgt=self.segmentator.vocab_tgt)
        self.csc14data = SighanCSC14Data(self.segmentator.tokenizer,
                                         self.segmentator.transformer,
                                         self.config,
                                         self.mode,
                                         args=args,
                                         vocab_tgt=self.segmentator.vocab_tgt)

        if args.dataset != 'CGED16':
            self.reviewdata = ReviewData(self.segmentator.tokenizer,
                                         self.segmentator.transformer,
                                         self.segmentator.vocab_tgt,
                                         self.config, self.mode)
            self.reviewdata_val = ReviewData(self.segmentator.tokenizer,
                                             self.segmentator.transformer,
                                             self.segmentator.vocab_tgt,
                                             self.config, 'test')

    def init_network(self):
        self.segmentator.initialize(mx.init.Xavier(), ctx=self.device)

        if self.args.dataset != 'CGED16':
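            # Copy the first len(vocab_tgt) rows of the pretrained encoder word
            # embedding into the target-side embedding weights (emb_tgt).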
            emb_pretrained = self.segmentator._collect_params_with_prefix(
            )['encoder.word_embed.0.weight'].data()[:len(self.segmentator.
                                                         vocab_tgt)]
            self.segmentator.collect_params().reset_ctx(self.device)
            self.segmentator._collect_params_with_prefix(
            )['emb_tgt.0.weight']._load_init(emb_pretrained, self.device)
        else:
            self.segmentator.collect_params().reset_ctx(self.device)

        if self.args.pretrain:
            return load_pretrained_model_only_same_shape(
                self.segmentator, 'model/bert_multi_full/0000140000-0.params',
                self.device)
        else:
            return load_latest_checkpoint(self.segmentator, self.arch_path,
                                          self.device)
        # if os.path.isdir('model/bert'):

    def init_trainer(self, step_start):
        self.lr_scheduler = mx.lr_scheduler.FactorScheduler(
            self.lr_decay_step, self.lr_decay_rate)
        self.optimizer = 'lamb'
        self.options = {
            'learning_rate':
            self.lr * (self.lr_decay_rate_epoch**int(
                step_start / float(self.lr_decay_step))),
            'lr_scheduler':
            self.lr_scheduler,
            'clip_gradient':
            0.1,
            # 'momentum' : 0.9,
            'wd':
            0.0001
        }

        self.trainer = mx.gluon.Trainer(self.segmentator.collect_params(),
                                        self.optimizer, self.options)

    def train(self):

        step_start, _ = self.init_network()

        self.init_trainer(step_start)

        self.sw = SummaryWriter(logdir='logs/' + self.config['arch_name'],
                                flush_secs=5)

        # self.cgedloader = self.cgeddata.get_loader()
        self.reviewloader = self.reviewdata.get_loader()
        # self.csc15loader = self.csc15data.get_loader()
        # self.csc14loader = self.csc14data.get_loader()
        self.reviewloader_val = self.reviewdata_val.get_loader()

        # self.int_cged_samples = len(self.cgeddata.data)
        self.int_review_samples = len(self.reviewdata.data)
        # self.int_csc15_samples = len(self.csc15data.data)
        # self.int_csc14_samples = len(self.csc14data.data)

        self.segmentator.hybridize()

        list_input_texts_test = None
        list_target_texts_test = None

        # self.cged_enumerator = enumerate(self.cgedloader)
        # self.cged_epoch = 0
        self.review_epoch = 0
        # self.csc15_epoch = 0
        # self.csc14_epoch = 0
        self.review_enumerator = enumerate(self.reviewloader)
        # self.csc15_enumerator = enumerate(self.csc15loader)
        # self.csc14_enumerator = enumerate(self.csc14loader)
        self.review_enumerator_val = enumerate(self.reviewloader_val)

        # progress_cged = tqdm(total = self.int_cged_samples, desc = 'cged')
        progress_review = tqdm(total=self.int_review_samples, desc='review')
        # progress_csc15 = tqdm(total = self.int_csc15_samples, desc = 'csc15')
        # progress_csc14 = tqdm(total = self.int_csc14_samples, desc = 'csc14')

        # to initialize the network parameters for multi-task training
        # i, (nd_input_word_idx, nd_input_valid_len, nd_input_segment, nd_target_word_idx, nd_target_valid_len, nd_target_segment,  nd_pm_error_idx, nd_pm_add_idx, nd_pm_remove_idx, list_input_texts, list_target_texts) = next(self.review_enumerator, (-1, [None] * 11))
        # nd_input_word_idx = gluon.utils.split_and_load(nd_input_word_idx[:2], self.device)
        # nd_input_valid_len = gluon.utils.split_and_load(nd_input_valid_len[:2], self.device)
        # nd_input_segment = gluon.utils.split_and_load(nd_input_segment[:2], self.device)

        # nd_target_word_idx = gluon.utils.split_and_load(nd_target_word_idx[:2], self.device)
        # nd_target_valid_len = gluon.utils.split_and_load(nd_target_valid_len[:2], self.device)
        # nd_target_segment = gluon.utils.split_and_load(nd_target_segment[:2], self.device)
        # self.segmentator.initialize_full_network(nd_input_word_idx, nd_input_valid_len, nd_input_segment, nd_target_word_idx, self.device)

        for _step in range(step_start, step_start + 30001):

            # try:  # works around batches with only one sample left, but it makes debugging difficult
            _dataset = np.random.choice(['review', 'cged', 'csc15', 'csc14'],
                                        p=[1, 0, 0, 0])
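            # With p=[1, 0, 0, 0] only 'review' is ever sampled; adjust the
            # probabilities to mix in the cged/csc15/csc14 datasets.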

            if (_step % self.config['val_freq'] == 0
                    or _step % 100 == 0) and _step != 0:
                _dataset = 'review'

            if _dataset == 'cged':
                # i, (nd_input_word_idx, nd_input_valid_len, nd_input_segment, list_ids, list_input_texts, list_target_texts) = self.cged_enumerator.__next__()
                i, (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
                    nd_error_idx, list_input_texts,
                    list_target_texts) = next(self.cged_enumerator,
                                              (-1, [None] * 6))
                if i == -1:
                    self.cgedloader = self.cgeddata.get_loader()
                    self.cged_epoch += 1
                    self.cged_enumerator = enumerate(self.cgedloader)
                    i, (nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_error_idx, list_input_texts,
                        list_target_texts) = next(self.cged_enumerator,
                                                  (-1, [None] * 6))
                    progress_cged = tqdm(total=self.int_cged_samples,
                                         desc='cged')
                _batch_size = int(nd_input_word_idx.shape[0] /
                                  len(self.device)) * len(self.device)

                nd_input_word_idx = gluon.utils.split_and_load(
                    nd_input_word_idx[:_batch_size], self.device)
                nd_input_valid_len = gluon.utils.split_and_load(
                    nd_input_valid_len[:_batch_size], self.device)
                nd_input_segment = gluon.utils.split_and_load(
                    nd_input_segment[:_batch_size], self.device)

                nd_error_idx = gluon.utils.split_and_load(
                    nd_error_idx[:_batch_size], self.device)

                loss = self.segmentator.train_CGED(nd_input_word_idx,
                                                   nd_input_valid_len,
                                                   nd_input_segment,
                                                   nd_error_idx, self.device,
                                                   _batch_size, self.trainer)
                progress_cged.update(_batch_size)

                if (_step % 100 == 0):
                    print('CGED Loss => {}, Epoch => {}, Lr => {}'.format(
                        loss, self.cged_epoch, self.trainer.learning_rate))

            elif _dataset == 'review':
                i, (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
                    nd_target_word_idx, nd_target_valid_len, nd_target_segment,
                    nd_pm_error_idx, nd_pm_add_idx, nd_pm_remove_idx,
                    list_input_texts,
                    list_target_texts) = next(self.review_enumerator,
                                              (-1, [None] * 11))
                if i == -1:
                    self.reviewloader = self.reviewdata.get_loader()
                    self.review_epoch += 1
                    self.review_enumerator = enumerate(self.reviewloader)
                    i, (nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx,
                        nd_target_valid_len, nd_target_segment,
                        nd_pm_error_idx, nd_pm_add_idx, nd_pm_remove_idx,
                        list_input_texts,
                        list_target_texts) = next(self.review_enumerator,
                                                  (-1, [None] * 11))
                    progress_review = tqdm(total=self.int_review_samples,
                                           desc='review')

                _batch_size = int(nd_input_word_idx.shape[0] /
                                  len(self.device)) * len(self.device)
                nd_input_word_idx = gluon.utils.split_and_load(
                    nd_input_word_idx[:_batch_size], self.device)
                nd_input_valid_len = gluon.utils.split_and_load(
                    nd_input_valid_len[:_batch_size], self.device)
                nd_input_segment = gluon.utils.split_and_load(
                    nd_input_segment[:_batch_size], self.device)

                nd_pm_error_idx = gluon.utils.split_and_load(
                    nd_pm_error_idx[:_batch_size], self.device)
                nd_pm_add_idx = gluon.utils.split_and_load(
                    nd_pm_add_idx[:_batch_size], self.device)
                nd_pm_remove_idx = gluon.utils.split_and_load(
                    nd_pm_remove_idx[:_batch_size], self.device)
                nd_target_word_idx = gluon.utils.split_and_load(
                    nd_target_word_idx[:_batch_size], self.device)
                nd_target_valid_len = gluon.utils.split_and_load(
                    nd_target_valid_len[:_batch_size], self.device)
                nd_target_segment = gluon.utils.split_and_load(
                    nd_target_segment[:_batch_size], self.device)

                loss, loss_pm = self.segmentator.train(
                    nd_input_word_idx, nd_input_valid_len, nd_input_segment,
                    nd_target_word_idx, nd_target_valid_len, nd_target_segment,
                    nd_pm_error_idx, nd_pm_add_idx, nd_pm_remove_idx,
                    list_input_texts[:_batch_size],
                    list_target_texts[:_batch_size], self.device, _batch_size,
                    self.trainer)

                if (_step % 100 == 0):

                    print('Review Loss => {}, Epoch => {}, Lr => {}'.format(
                        loss, self.review_epoch, self.trainer.learning_rate))
                    self.sw.add_scalar(tag='review_loss',
                                       value=loss,
                                       global_step=_step)
                    if self.config['use_encoder_constraint']:
                        self.sw.add_scalar(tag='pm_loss',
                                           value=loss_pm,
                                           global_step=_step)

                progress_review.update(_batch_size)

            elif _dataset == 'csc15':
                if not self.config['csc_fixed']:
                    i, (nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx,
                        nd_target_valid_len, nd_target_segment,
                        list_input_texts,
                        list_target_texts) = next(self.csc15_enumerator,
                                                  (-1, [None] * 8))
                else:
                    i, (nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx, list_input_texts,
                        list_target_texts) = next(self.csc15_enumerator,
                                                  (-1, [None] * 6))
                if i == -1:
                    self.csc15loader = self.csc15data.get_loader()
                    self.csc15_epoch += 1
                    self.csc15_enumerator = enumerate(self.csc15loader)
                    i, (nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx,
                        nd_target_valid_len, nd_target_segment,
                        list_input_texts,
                        list_target_texts) = next(self.csc15_enumerator,
                                                  (-1, [None] * 8))
                    progress_csc15 = tqdm(total=self.int_csc15_samples,
                                          desc='csc15')

                _batch_size = int(nd_input_word_idx.shape[0] /
                                  len(self.device)) * len(self.device)
                nd_input_word_idx = gluon.utils.split_and_load(
                    nd_input_word_idx[:_batch_size], self.device)
                nd_input_valid_len = gluon.utils.split_and_load(
                    nd_input_valid_len[:_batch_size], self.device)
                nd_input_segment = gluon.utils.split_and_load(
                    nd_input_segment[:_batch_size], self.device)

                nd_target_word_idx = gluon.utils.split_and_load(
                    nd_target_word_idx[:_batch_size], self.device)
                # nd_target_valid_len = gluon.utils.split_and_load(nd_target_valid_len[:_batch_size], self.device)
                # nd_target_segment = gluon.utils.split_and_load(nd_target_segment[:_batch_size], self.device)
                if not self.config['csc_fixed']:
                    nd_target_valid_len = gluon.utils.split_and_load(
                        nd_target_valid_len[:_batch_size], self.device)
                    nd_target_segment = gluon.utils.split_and_load(
                        nd_target_segment[:_batch_size], self.device)

                    loss = self.segmentator.train(
                        nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx,
                        nd_target_valid_len, nd_target_segment,
                        list_input_texts[:_batch_size],
                        list_target_texts[:_batch_size], self.device,
                        _batch_size, self.trainer)

                else:
                    loss = self.segmentator.train_csc_fixed(
                        nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx, self.device,
                        _batch_size, self.trainer)

                # loss = self.segmentator.train(nd_input_word_idx, nd_input_valid_len, nd_input_segment,
                #                        nd_target_word_idx, nd_target_valid_len, nd_target_segment,
                #list_input_texts[:_batch_size], list_target_texts[:_batch_size], self.device, _batch_size, self.trainer)

                if (_step % 100 == 0):

                    print('CSC15 Loss => {}, Epoch => {}, Lr => {}'.format(
                        loss, self.csc15_epoch, self.trainer.learning_rate))

                progress_csc15.update(_batch_size)
            #   print('Epoch {} Step {} => {}, LR : {}'.format(e, i, loss, self.trainer.learning_rate))
            elif _dataset == 'csc14':

                if not self.config['csc_fixed']:
                    i, (nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx,
                        nd_target_valid_len, nd_target_segment,
                        list_input_texts,
                        list_target_texts) = next(self.csc14_enumerator,
                                                  (-1, [None] * 8))
                else:
                    i, (nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx, list_input_texts,
                        list_target_texts) = next(self.csc14_enumerator,
                                                  (-1, [None] * 6))

                if i == -1:
                    self.csc14loader = self.csc14data.get_loader()
                    self.csc14_epoch += 1
                    self.csc14_enumerator = enumerate(self.csc14loader)
                    i, (nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx,
                        nd_target_valid_len, nd_target_segment,
                        list_input_texts,
                        list_target_texts) = next(self.csc14_enumerator,
                                                  (-1, [None] * 8))
                    progress_csc14 = tqdm(total=self.int_csc14_samples,
                                          desc='csc14')

                _batch_size = int(nd_input_word_idx.shape[0] /
                                  len(self.device)) * len(self.device)
                nd_input_word_idx = gluon.utils.split_and_load(
                    nd_input_word_idx[:_batch_size], self.device)
                nd_input_valid_len = gluon.utils.split_and_load(
                    nd_input_valid_len[:_batch_size], self.device)
                nd_input_segment = gluon.utils.split_and_load(
                    nd_input_segment[:_batch_size], self.device)

                nd_target_word_idx = gluon.utils.split_and_load(
                    nd_target_word_idx[:_batch_size], self.device)

                if not self.config['csc_fixed']:
                    nd_target_valid_len = gluon.utils.split_and_load(
                        nd_target_valid_len[:_batch_size], self.device)
                    nd_target_segment = gluon.utils.split_and_load(
                        nd_target_segment[:_batch_size], self.device)

                    loss = self.segmentator.train(
                        nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx,
                        nd_target_valid_len, nd_target_segment,
                        list_input_texts[:_batch_size],
                        list_target_texts[:_batch_size], self.device,
                        _batch_size, self.trainer)

                else:
                    loss = self.segmentator.train_csc_fixed(
                        nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_target_word_idx, self.device,
                        _batch_size, self.trainer)

                if (_step % 100 == 0):

                    print('CSC14 Loss => {}, Epoch => {}, Lr => {}'.format(
                        loss, self.csc14_epoch, self.trainer.learning_rate))

                progress_csc14.update(_batch_size)
            #   print('Epoch {} Step {} => {}, LR : {}'.format(e, i, loss, self.trainer.learning_rate))
            # except Exception as e:
            #   # print(e)
            #   continue
            # if e % self.save_freq == 0:
            #   save_gluon_model(self.segmentator, self.arch_path, e, 0) # use main     dataset
            if _step % self.save_freq_step == 0:
                save_gluon_model(self.segmentator, self.arch_path, _step,
                                 0)  # use main dataset

            if _step % self.config['val_freq'] == 0 and _step != 0:
                # _, batch_test = self.review_enumerator_val.__next__()
                # list_input_texts_test = batch_test[0]
                # list_target_texts_test = batch_test[1]

                # print('Input => ', list_input_texts_test)
                # print('Target => ', list_target_texts_test)

                # text = self.segmentator.run(list_input_texts_test, self.device)

                avg_val_bleu, predict_text = self.val_using_testset()

                self.sw.add_scalar(tag='avg_val_bleu',
                                   value=avg_val_bleu,
                                   global_step=_step)

            # self.options['learning_rate'] = self.trainer.learning_rate * self.lr_decay_rate_epoch

            # self.trainer = mx.gluon.Trainer(self.segmentator.collect_params(), self.optimizer, self.options)

    def test(self):

        self.segmentator.hybridize()
        self.loader = self.reviewdata_val.get_loader()
        for i, batch in enumerate(self.loader):

            str_input_text, str_target_text = batch
            str_predict_text = self.segmentator.run(str_input_text,
                                                    self.test_device)

            score = sentence_bleu([[t for t in str_predict_text]],
                                  [t for t in str_target_text])
            print('BLEU => {}'.format(score))
            break  # only the first batch is evaluated here

    def val_using_testset(self):

        # self.segmentator.hybridize()
        progress_val = tqdm(total=self.config['int_val_set'], desc='val')

        self.loader = self.reviewdata_val.get_loader()
        scores = []
        for i, batch in enumerate(self.loader):
            if i == self.config['int_val_set']:
                break
            str_input_text, str_target_text = batch
            str_predict_text = self.segmentator.run(str_input_text,
                                                    self.test_device)
            score = sentence_bleu([[t for t in str_predict_text]],
                                  [t for t in str_target_text])
            scores.append(score)
            progress_val.update(1)

        return np.mean(scores), str_predict_text

    def run(self, text):

        self.init_network()
        self.segmentator.hybridize()
        self.segmentator.run(text, self.device)

    def interact(self):

        self.init_network()
        self.segmentator.hybridize()
        shell(self.segmentator, self.arch_path, self.device).cmdloop()

    def train_CGED(self):
        epoch_start, _ = self.init_network()
        epoch_start += 1

        self.init_trainer(epoch_start)
        self.loader = self.cgeddata.get_loader()
        # self.loader_val = self.data_val.get_loader()

        self.num_samples = len(self.cgeddata.data)
        self.segmentator.hybridize()

        for e in range(epoch_start, epoch_start + 10):
            progress = tqdm(total=self.num_samples)
            for i, batch in enumerate(self.loader):
                nd_input_word_idx, nd_input_valid_len, nd_input_segment, nd_error_idx, list_input_texts, list_target_texts = batch
                _batch_size = int(nd_input_word_idx.shape[0] /
                                  len(self.device)) * len(self.device)

                nd_input_word_idx = gluon.utils.split_and_load(
                    nd_input_word_idx[:_batch_size], self.device)
                nd_input_valid_len = gluon.utils.split_and_load(
                    nd_input_valid_len[:_batch_size], self.device)
                nd_input_segment = gluon.utils.split_and_load(
                    nd_input_segment[:_batch_size], self.device)

                nd_error_idx = gluon.utils.split_and_load(
                    nd_error_idx[:_batch_size], self.device)
                # nd_start_idx = gluon.utils.split_and_load(nd_start_idx[:_batch_size], self.device)
                # nd_end_idx = gluon.utils.split_and_load(nd_end_idx[:_batch_size], self.device)

                if self.args.decoder:

                    loss = self.segmentator.train_CGEDDecoder(
                        nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_error_idx, self.device,
                        _batch_size, self.trainer)

                    # if (i % 100 == 0):

                    #   print("=" * 10)

                    #   print('Epoch {} Step {} => {}, LR : {}'.format(e, i, loss, self.trainer.learning_rate))

                    #   _, batch_test = enumerator_val.__next__()
                    #   list_input_texts_test = batch_test[6]
                    #   list_target_texts_test = batch_test[7]

                    #   print('Input => ', list_input_texts_test[0])
                    #   print('Target => ', list_target_texts_test[0])

                    #   text = self.segmentator.run(list_input_texts_test[0], self.device)

                else:

                    loss = self.segmentator.train_CGED(
                        nd_input_word_idx, nd_input_valid_len,
                        nd_input_segment, nd_error_idx, self.device,
                        _batch_size, self.trainer)

                progress.update(self.batch_size)

                if (i % 10 == 0):

                    loss = sum([_loss.asnumpy().mean()
                                for _loss in loss])  # / _batch_size
                    print('[*] Loss : {}, Lr : {}'.format(
                        loss, self.trainer.learning_rate))

            if e % self.save_freq == 0:
                save_gluon_model(self.segmentator, self.arch_path, e,
                                 0)  # use main dataset
            elif i % self.save_freq_step == 0:
                save_gluon_model(self.segmentator, self.arch_path, e - 1,
                                 i)  # use main dataset

            self.options[
                'learning_rate'] = self.trainer.learning_rate * self.lr_decay_rate_epoch

            self.trainer = mx.gluon.Trainer(self.segmentator.collect_params(),
                                            self.optimizer, self.options)

    def test_CGED(self):

        self.init_network()
        self.loader = self.cgeddata.get_loader()
        self.num_samples = len(self.cgeddata.data)
        self.segmentator.hybridize()

        progress = tqdm(total=self.num_samples)

        list_result = []
        for i, batch in enumerate(self.loader):
            nd_input_word_idx, nd_input_valid_len, nd_input_segment, list_ids, list_input_texts, list_target_texts = batch
            _batch_size = int(nd_input_word_idx.shape[0] /
                              len(self.device)) * len(self.device)

            nd_input_word_idx = gluon.utils.split_and_load(
                nd_input_word_idx[:_batch_size], self.device)
            nd_input_valid_len = gluon.utils.split_and_load(
                nd_input_valid_len[:_batch_size], self.device)
            nd_input_segment = gluon.utils.split_and_load(
                nd_input_segment[:_batch_size], self.device)

            if args.decoder:
                _predict_error_idx = self.segmentator.test_CGEDDecoder(
                    nd_input_word_idx, nd_input_valid_len, nd_input_segment,
                    self.device)
            else:
                _predict_error_idx = self.segmentator.test_CGED(
                    nd_input_word_idx, nd_input_valid_len, nd_input_segment,
                    self.device)

            _list_result = self.cgeddata.convert_to_report_format(
                list_ids, _predict_error_idx)

            list_result.extend(_list_result)

            progress.update(self.batch_size)

        self.cgeddata.gen_eval_report(list_result)

    def train_paper(self):

        epoch_start, _ = self.init_network()
        epoch_start += 1

        self.init_trainer(epoch_start)
        self.loader = self.data.get_loader()
        # self.loader_val = self.data_val.get_loader()

        self.num_samples = len(self.data.data)
        self.segmentator.hybridize()
        pass

    def test_CSC(self):

        self.init_network()
        self.segmentator.hybridize()
        if self.args.dataset == 'CSC14':
            self.cscdata = self.csc14data
            self.cscloader = self.csc14data.get_loader()
            self.int_samples = len(self.csc14data.data)
        else:
            self.cscloader = self.csc15data.get_loader()
            self.int_samples = len(self.csc15data.data)
            self.cscdata = self.csc15data

        self.num_samples = len(self.cscdata.data)

        progress = tqdm(total=self.num_samples)

        reports = []
        for i, batch in enumerate(self.cscloader):
            nd_input_word_idx, nd_input_valid_len, nd_input_segment, list_input_texts, list_target_text, list_ids = batch
            print('input => ', list_input_texts[0])
            print('target => ', list_target_text[0])
            prediction = self.segmentator.run(list_input_texts[0], self.device)

            report = self.cscdata.compare_input_prediction(
                list_ids[0], prediction, list_input_texts[0])
            reports.append(report)
            progress.update(1)

        self.csc15data.gen_eval_report(reports)
Ejemplo n.º 31
0
def train(opt, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    kv = mx.kv.create(opt.kvstore)
    train_data, val_data, batch_fn = get_data_iters(opt)
    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(net.collect_params(),
                            *get_optimizer(opt),
                            kvstore=kv)
    if opt.resume_states != '':
        trainer.load_states(opt.resume_states)
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # dummy forward pass to initialize binary layers
    data, _ = get_dummy_data(opt, ctx[0])
    _ = net(data)
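    # (Gluon defers parameter shape inference until the first forward pass,
    #  so this forward call materializes the network's parameters.)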

    if opt.mode == 'hybrid':
        net.hybridize()

    # set batch norm wd to zero
    params = net.collect_params('.*batchnorm.*')
    for key in params:
        params[key].wd_mult = 0.0
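    # (Excluding batch-norm parameters from weight decay is common practice;
    #  decaying their scale/shift terms typically brings no benefit.)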

    if opt.plot_network is not None:
        plot_network()

    if opt.dry_run:
        return

    summary_writer = None
    if opt.write_summary:
        from mxboard import SummaryWriter
        summary_writer = SummaryWriter(logdir=opt.write_summary, flush_secs=60)
    write_net_summaries(summary_writer, ctx[0], write_grads=False)
    track_lr = LRTracker(trainer, summary_writer)

    total_time = 0
    num_epochs = 0
    best_acc = 0
    epoch_time = -1
    num_examples = get_num_examples(opt.dataset)

    for epoch in range(opt.start_epoch, opt.epochs):
        global_step = epoch * num_examples
        track_lr(epoch, global_step)
        tic = time.time()
        if hasattr(train_data, "reset"):
            train_data.reset()
        metric.reset()
        btic = time.time()
        for i, batch in enumerate(train_data):
            data, label = batch_fn(batch, ctx)
            outputs = []
            Ls = []
            with autograd.record():
                for x, y in zip(data, label):
                    z = net(x)
                    L = loss(z, y)
                    # store the loss and do backward after we have done forward
                    # on all GPUs for better speed on multiple GPUs.
                    Ls.append(L)
                    outputs.append(z)
                autograd.backward(Ls)
            trainer.step(batch_size)
            metric.update(label, outputs)

            if opt.log_interval and not (i + 1) % opt.log_interval:
                name, acc = metric.get()
                log_metrics("batch",
                            name,
                            acc,
                            epoch,
                            summary_writer,
                            global_step,
                            sep=" [%d]\tSpeed: %f samples/sec\t" %
                            (i, batch_size / (time.time() - btic)))
                log_progress(num_examples, opt, epoch, i,
                             time.time() - tic, epoch_time)
                track_lr(epoch, global_step)

            btic = time.time()
            global_step += batch_size
            if opt.test_run:
                break

        epoch_time = time.time() - tic

        write_net_summaries(summary_writer, ctx[0], global_step=global_step)

        # The first epoch will usually be much slower than the subsequent epochs,
        # so don't factor it into the average
        if num_epochs > 0:
            total_time = total_time + epoch_time
        num_epochs = num_epochs + 1

        logger.info('[Epoch %d] time cost: %f' % (epoch, epoch_time))
        if summary_writer:
            summary_writer.add_scalar("training/epoch",
                                      epoch,
                                      global_step=global_step)
            summary_writer.add_scalar("training/epoch-time",
                                      epoch_time,
                                      global_step=global_step)

        # train
        name, acc = metric.get()
        log_metrics("training", name, acc, epoch, summary_writer, global_step)

        # test
        name, val_acc = test(ctx, val_data, batch_fn, opt.test_run)
        log_metrics("validation", name, val_acc, epoch, summary_writer,
                    global_step)

        if opt.interrupt_at is not None and epoch + 1 == opt.interrupt_at:
            logging.info(
                "[Epoch %d] Interrupting run now because 'interrupt-at' was set to %d..."
                % (epoch, opt.interrupt_at))
            save_checkpoint(trainer,
                            epoch,
                            val_acc[0],
                            best_acc,
                            force_save=True)
            sys.exit(3)

        # save model if meet requirements
        save_checkpoint(trainer, epoch, val_acc[0], best_acc)
        best_acc = max(best_acc, val_acc[0])

    if num_epochs > 1:
        print('Average epoch time: {}'.format(
            float(total_time) / (num_epochs - 1)))

    if opt.mode != 'hybrid':
        net.hybridize()
    # dummy forward pass to save model
    data, _ = get_dummy_data(opt, ctx[0])
    _ = net(data)
    net.export(os.path.join(opt.prefix,
                            "image-classifier-{}bit".format(opt.bits)),
               epoch=0)
Ejemplo n.º 32
0
class FIFOScheduler(TaskScheduler):
    r"""Simple scheduler that just runs trials in submission order.

    Parameters
    ----------
    train_fn: callable
        A task launch function for training.
    args: object (optional)
        Default arguments for launching train_fn.
    resource: dict
        Computation resources. For example, `{'num_cpus':2, 'num_gpus':1}`
    searcher: str or BaseSearcher
        Searcher (get_config decisions). If str, this is passed to
        searcher_factory along with search_options.
    search_options: dict
        If searcher is str, these arguments are passed to searcher_factory.
    checkpoint: str
        If filename given here, a checkpoint of scheduler (and searcher) state
        is written to file every time a job finishes.
        Note: May not be fully supported by all searchers.
    resume: bool
        If True, scheduler state is loaded from checkpoint, and experiment
        starts from there.
        Note: May not be fully supported by all searchers.
    num_trials: int
        Maximum number of jobs run in experiment.
    time_out: float
        If given, jobs are started only until this time_out (wall clock time)
    reward_attr: str
        Name of reward (i.e., metric to maximize) attribute in data obtained
        from reporter
    time_attr: str
        Name of resource (or time) attribute in data obtained from reporter.
        This attribute is optional for FIFO scheduling, but becomes mandatory
        in multi-fidelity scheduling (e.g., Hyperband).
        Note: The type of resource must be int.
    dist_ip_addrs: list of str
        IP addresses of remote machines.
    training_history_callback: callable
        Callback function func called every time a job finishes, if at least
        training_history_callback_delta_secs seconds have passed since the
        most recent call. The call has the form:
            func(self.training_history, self._start_time)
        Here, self._start_time is the time stamp for when the experiment
        started. Use this callback to serialize self.training_history at
        regular intervals.
    training_history_callback_delta_secs: float
        See training_history_callback.
    delay_get_config: bool
        If True, the call to searcher.get_config is delayed until a worker
        resource for evaluation is available. Otherwise, get_config is called
        just after a job has been started.
        For searchers which adapt to past data, True should be preferred.
        Otherwise, it does not matter.


    Examples
    --------
    >>> import numpy as np
    >>> import autogluon as ag
    >>> @ag.args(
    ...     lr=ag.space.Real(1e-3, 1e-2, log=True),
    ...     wd=ag.space.Real(1e-3, 1e-2))
    >>> def train_fn(args, reporter):
    ...     print('lr: {}, wd: {}'.format(args.lr, args.wd))
    ...     for e in range(10):
    ...         dummy_accuracy = 1 - np.power(1.8, -np.random.uniform(e, 2*e))
    ...         reporter(epoch=e, accuracy=dummy_accuracy, lr=args.lr, wd=args.wd)
    >>> scheduler = ag.scheduler.FIFOScheduler(train_fn,
    ...                                        resource={'num_cpus': 2, 'num_gpus': 0},
    ...                                        num_trials=20,
    ...                                        reward_attr='accuracy',
    ...                                        time_attr='epoch')
    >>> scheduler.run()
    >>> scheduler.join_jobs()
    >>> scheduler.get_training_curves(plot=True)
    """

    def __init__(self, train_fn, args=None, resource=None,
                 searcher=None, search_options=None,
                 checkpoint=None,
                 resume=False, num_trials=None,
                 time_out=None, max_reward=None, reward_attr='accuracy',
                 time_attr='epoch',
                 visualizer='none', dist_ip_addrs=None,
                 training_history_callback=None,
                 training_history_callback_delta_secs=60,
                 delay_get_config=True):
        super().__init__(dist_ip_addrs)
        if resource is None:
            resource = {'num_cpus': 1, 'num_gpus': 0}
        self.resource = resource

        if searcher is None:
            searcher = 'random'  # RandomSearcher
        if isinstance(searcher, str):
            if search_options is None:
                search_options = dict()
            _search_options = search_options.copy()
            _search_options['configspace'] = train_fn.cs
            _search_options['reward_attribute'] = reward_attr
            # Adjoin scheduler info to search_options, if not already done by
            # subclass
            if 'scheduler' not in _search_options:
                _search_options['scheduler'] = 'fifo'
            self.searcher: BaseSearcher = searcher_factory(
                searcher, **_search_options)
        else:
            assert isinstance(searcher, BaseSearcher)
            self.searcher: BaseSearcher = searcher

        assert isinstance(train_fn, _autogluon_method)
        self.train_fn = train_fn
        self.args = args if args else train_fn.args
        if num_trials is None:
            assert time_out is not None, \
                "Need stopping criterion: Either num_trials or time_out"
            num_trials = 100000  # time_out is what matters
        self.num_trials = num_trials
        self.time_out = time_out
        self.max_reward = max_reward
        # meta data
        self.metadata = {
            'search_space': train_fn.kwspaces,
            'search_strategy': searcher,
            'stop_criterion': {
                'time_limits': time_out, 'max_reward': max_reward},
            'resources_per_trial': resource}

        self._checkpoint = checkpoint
        self._reward_attr = reward_attr
        self._time_attr = time_attr
        self.visualizer = visualizer.lower()
        if self.visualizer == 'tensorboard' or self.visualizer == 'mxboard':
            assert checkpoint is not None, \
                "Need checkpoint to be set"
            try_import_mxboard()
            from mxboard import SummaryWriter
            self.mxboard = SummaryWriter(
                logdir=os.path.join(os.path.splitext(checkpoint)[0], 'logs'),
                flush_secs=3,
                verbose=False
            )

        self._fifo_lock = mp.Lock()
        # training_history maintains the complete history of the experiment,
        # in terms of all results obtained from the reporter. Keys are
        # str(task.task_id)
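        # Illustrative layout (hypothetical values), matching the reporter call
        # in the class docstring example:
        #   {'0': [{'epoch': 0, 'accuracy': 0.42, 'time_since_start': 1.3}, ...],
        #    '1': [...], ...}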
        self.training_history = OrderedDict()
        self.config_history = OrderedDict()
        # Resume experiment from checkpoint?
        if resume:
            assert checkpoint is not None, \
                "Need checkpoint to be set if resume = True"
            if os.path.isfile(checkpoint):
                self.load_state_dict(load(checkpoint))
            else:
                msg = f'checkpoint path {checkpoint} is not available for resume.'
                logger.exception(msg)
                raise FileExistsError(msg)
        # Needed for training_history callback mechanism, which is used to
        # serialize training_history after each
        # training_history_call_delta_secs seconds
        self._start_time = None
        self._training_history_callback_last_block = None
        self._training_history_callback_last_len = None
        self.training_history_callback = training_history_callback
        self.training_history_callback_delta_secs = \
            training_history_callback_delta_secs
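        # An illustrative callback (a sketch, not part of the original code):
        #   def dump_history(training_history, start_time):
        #       with open('training_history.json', 'w') as f:
        #           json.dump(training_history, f)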
        self._delay_get_config = delay_get_config

    def run(self, **kwargs):
        """Run multiple number of trials
        """
        # Make sure that this scheduler is configured at the searcher
        self.searcher.configure_scheduler(self)
        start_time = time.time()
        self._start_time = start_time
        num_trials = kwargs.get('num_trials', self.num_trials)
        time_out = kwargs.get('time_out', self.time_out)
        # For training_history callback mechanism:
        self._training_history_callback_last_block = -1
        self._training_history_callback_last_len = len(self.training_history)

        logger.info('Starting Experiments')
        logger.info(f'Num of Finished Tasks is {self.num_finished_tasks}')
        logger.info(f'Num of Pending Tasks is {num_trials - self.num_finished_tasks}')
        if time_out is not None:
            logger.info(f'Time out (secs) is {time_out}')
        # TODO: This bar is misleading if num_trials not set
        tbar = tqdm(range(self.num_finished_tasks, num_trials))
        for _ in tbar:
            if (time_out and time.time() - start_time >= time_out) or \
                    (self.max_reward and self.get_best_reward() >= self.max_reward):
                break
            self.schedule_next()

    def save(self, checkpoint=None):
        """Save Checkpoint
        """
        if checkpoint is None:
            checkpoint = self._checkpoint
        if checkpoint is not None:
            mkdir(os.path.dirname(checkpoint))
            save(self.state_dict(), checkpoint)

    def _create_new_task(self, config, resources=None):
        if resources is None:
            resources = DistributedResource(**self.resource)
        return Task(
            self.train_fn, {'args': self.args, 'config': config},
            resources=resources)

    def schedule_next(self):
        """Schedule next searcher suggested task
        """
        resources = DistributedResource(**self.resource)
        if self._delay_get_config:
            # Wait for available resource here, instead of in add_job. This
            # delays the get_config call until a resource is available
            FIFOScheduler.RESOURCE_MANAGER._request(resources)

        # Allow for the promotion of a previously chosen config. Also,
        # extra_kwargs contains extra info passed to both add_job and to
        # get_config (if no config is promoted)
        config, extra_kwargs = self._promote_config()
        # Time stamp to be used in get_config, and maybe in add_job
        extra_kwargs['elapsed_time'] = self._elapsed_time()
        if config is None:
            # No config to promote: Query next config to evaluate from searcher
            config = self.searcher.get_config(**extra_kwargs)
            extra_kwargs['new_config'] = True
        else:
            # This is not a new config, but a paused one which is now promoted
            extra_kwargs['new_config'] = False
        task = self._create_new_task(config, resources=resources)
        self.add_job(task, **extra_kwargs)

    def run_with_config(self, config):
        """Run with config for final fit.
        It launches a single training trial under any fixed values of the hyperparameters.
        For example, after HPO has identified the best hyperparameter values based on a hold-out dataset,
        one can use this function to retrain a model with the same hyperparameters on all the available labeled data
        (including the hold out set). It can also returns other objects or states.
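
        A minimal illustrative usage (assuming the search has already finished):

        >>> best_config = scheduler.get_best_config()
        >>> scheduler.run_with_config(best_config)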
        """
        task = self._create_new_task(config)
        reporter = FakeReporter()
        task.args['reporter'] = reporter
        return self.run_job(task)

    def _dict_from_task(self, task):
        if isinstance(task, Task):
            return {'TASK_ID': task.task_id, 'Config': task.args['config']}
        else:
            assert isinstance(task, dict)
            return {'TASK_ID': task['TASK_ID'], 'Config': task['Config']}

    def add_job(self, task, **kwargs):
        """Adding a training task to the scheduler.

        Args:
            task (:class:`autogluon.scheduler.Task`): a new training task

        Relevant entries in kwargs:
            - bracket: HB bracket to be used. Has been sampled in _promote_config
            - new_config: If True, task starts new config eval, otherwise it promotes
              a config (only if type == 'promotion')

        Only if new_config == False:
            - config_key: Internal key for config
            - resume_from: config promoted from this milestone
            - milestone: config promoted to this milestone (next from resume_from)
        """
        cls = FIFOScheduler
        if not self._delay_get_config:
            # Wait for resource to become available here, as this has not happened
            # in schedule_next before
            cls.RESOURCE_MANAGER._request(task.resources)
        # reporter
        reporter = DistStatusReporter(remote=task.resources.node)
        task.args['reporter'] = reporter
        # Register pending evaluation
        self.searcher.register_pending(task.args['config'])
        # main process
        job = cls._start_distributed_job(task, cls.RESOURCE_MANAGER)
        # reporter thread
        rp = threading.Thread(
            target=self._run_reporter,
            args=(task, job, reporter),
            daemon=False)
        rp.start()
        task_dict = self._dict_from_task(task)
        task_dict.update({'Task': task, 'Job': job, 'ReporterThread': rp})
        # Checkpoint thread. This is also used for training_history
        # callback
        if self._checkpoint is not None or \
                self.training_history_callback is not None:
            self._add_checkpointing_to_job(job)
        with self.LOCK:
            self.scheduled_tasks.append(task_dict)

    def _clean_task_internal(self, task_dict):
        task_dict['ReporterThread'].join()

    def _add_checkpointing_to_job(self, job):
        def _save_checkpoint_callback(fut):
            self._cleaning_tasks()
            self.save()
            # training_history callback
            with self._fifo_lock:
                if self._trigger_training_history_callback():
                    logger.debug("Execute training_history callback")
                    self.training_history_callback(
                        self.training_history, self._start_time)

        job.add_done_callback(_save_checkpoint_callback)

    def _trigger_training_history_callback(self):
        if self.training_history_callback is None:
            return False
        assert self._training_history_callback_last_block is not None
        current_block = int(np.floor(
            self._elapsed_time() / self.training_history_callback_delta_secs))
        current_len = len(self.training_history)
        ret_val = (current_block >
                   self._training_history_callback_last_block) and \
            current_len > self._training_history_callback_last_len
        if ret_val:
            self._training_history_callback_last_block = current_block
            self._training_history_callback_last_len = current_len
        return ret_val

    def _run_reporter(self, task, task_job, reporter):
        last_result = None
        while not task_job.done():
            reported_result = reporter.fetch()
            # Time since start of experiment
            elapsed_time = self._elapsed_time()
            reported_result['time_since_start'] = elapsed_time
            if 'traceback' in reported_result:
                # Evaluation has failed
                logger.exception(reported_result['traceback'])
                self.searcher.evaluation_failed(
                    config=task.args['config'], **reported_result)
                reporter.move_on()
                break
            if reported_result.get('done', False):
                reporter.move_on()
                break
            # Extra information from searcher (optional)
            dataset_size = self.searcher.dataset_size()
            if dataset_size > 0:
                reported_result['searcher_data_size'] = dataset_size
            for k, v in self.searcher.cumulative_profile_record().items():
                reported_result['searcher_profile_' + k] = v
            for k, v in self.searcher.model_parameters().items():
                reported_result['searcher_params_' + k] = v
            self._add_training_result(
                task.task_id, reported_result, config=task.args['config'])
            reporter.move_on()
            last_result = reported_result
        # Pass all of last_result to searcher
        if last_result is not None:
            self.searcher.update(config=task.args['config'], **last_result)

    def _promote_config(self):
        """
        Provides a hook in schedule_next which allows promoting a config
        that was selected and partially evaluated previously.

        :return: config, extra_args
        """
        config = None
        extra_args = dict()
        return config, extra_args

    def _elapsed_time(self):
        """
        :return: Time elapsed since start of experiment (see 'run')
        """
        assert self._start_time is not None, \
            "Experiment has not been started yet"
        return time.time() - self._start_time

    def get_best_config(self):
        """Get the best configuration from the finished jobs.
        """
        return self.searcher.get_best_config()

    def get_best_reward(self):
        """Get the best reward from the finished jobs.
        """
        return self.searcher.get_best_reward()

    def _add_training_result(self, task_id, reported_result, config=None):
        if self.visualizer == 'mxboard' or self.visualizer == 'tensorboard':
            if 'loss' in reported_result:
                self.mxboard.add_scalar(
                    tag='loss',
                    value=(
                        f'task {task_id} valid_loss',
                        reported_result['loss']
                    ),
                    global_step=reported_result[self._reward_attr]
                )
            self.mxboard.add_scalar(
                tag=self._reward_attr,
                value=(
                    f'task {task_id} {self._reward_attr}',
                    reported_result[self._reward_attr]
                ),
                global_step=reported_result[self._reward_attr]
            )
        with self._fifo_lock:
            # Note: We store all of reported_result in training_history[task_id],
            # not just the reward value.
            task_key = str(task_id)
            new_entry = copy.copy(reported_result)
            if task_key in self.training_history:
                self.training_history[task_key].append(new_entry)
            else:
                self.training_history[task_key] = [new_entry]
                if config:
                    self.config_history[task_key] = config

    def get_training_curves(self, filename=None, plot=False, use_legend=True):
        """Get Training Curves

        Parameters
        ----------
            filename : str
            plot : bool
            use_legend : bool

        Examples
        --------
        >>> scheduler.run()
        >>> scheduler.join_jobs()
        >>> scheduler.get_training_curves(plot=True)

            .. image:: https://github.com/zhanghang1989/AutoGluonWebdata/blob/master/doc/api/autogluon.1.png?raw=true
        """
        if filename is None and not plot:
            logger.warning('Please provide a filename or set plot=True in get_training_curves')
        import matplotlib.pyplot as plt
        plt.ylabel(self._reward_attr)
        plt.xlabel(self._time_attr)
        plt.title("Performance vs Training-Time in each HPO Trial")
        with self._fifo_lock:
            for task_id, task_res in self.training_history.items():
                rewards = [x[self._reward_attr] for x in task_res]
                x = list(range(len(task_res)))
                plt.plot(x, rewards, label=f'task {task_id}')
        if use_legend:
            plt.legend(loc='best')
        if filename:
            logger.info(f'Saving Training Curve in {filename}')
            plt.savefig(filename)
        if plot:
            plt.show()

    def state_dict(self, destination=None):
        """
        Returns a dictionary containing a whole state of the Scheduler. This is
        used for checkpointing.

        Note that the checkpoint only contains information which has been
        registered at scheduler and searcher. It does not contain information
        about currently running jobs, except what they reported before the
        checkpoint.
        Therefore, resuming an experiment from a checkpoint is slightly
        different from continuing the experiment past the checkpoint. The
        former behaves as if all currently running jobs are terminated at
        the checkpoint, and new jobs are scheduled from there, starting from
        scheduler and searcher state according to all information recorded
        until the checkpoint.

        Examples
        --------
        >>> ag.save(scheduler.state_dict(), 'checkpoint.ag')
        """
        destination = super().state_dict(destination)
        with self._fifo_lock:
            # The result of searcher.get_state can always be pickled
            destination['searcher'] = pickle.dumps(self.searcher.get_state())
            destination['training_history'] = json.dumps(self.training_history)
            destination['config_history'] = json.dumps(self.config_history)
        if self.visualizer == 'mxboard' or self.visualizer == 'tensorboard':
            destination['visualizer'] = json.dumps(self.mxboard._scalar_dict)
        return destination

    def load_state_dict(self, state_dict):
        """
        Load from the saved state dict. This can be used to resume an
        experiment from a checkpoint (see 'state_dict' for caveats).

        This method must only be called as part of scheduler construction.
        Calling it in the middle of an experiment can lead to an undefined
        inner state of scheduler or searcher.

        Examples
        --------
        >>> scheduler.load_state_dict(ag.load('checkpoint.ag'))
        """
        super().load_state_dict(state_dict)
        with self._fifo_lock:
            self.searcher = self.searcher.clone_from_state(
                pickle.loads(state_dict['searcher']))
            self.training_history = json.loads(state_dict['training_history'])
            self.config_history = json.loads(state_dict['config_history'])
        if self.visualizer == 'mxboard' or self.visualizer == 'tensorboard':
            self.mxboard._scalar_dict = json.loads(state_dict['visualizer'])
        logger.debug(f'Loading Searcher State {self.searcher}')
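    # Checkpoint/resume sketch combining state_dict and load_state_dict (uses the same
    # `ag.save`/`ag.load` helpers as the docstrings above; the scheduler class name and
    # constructor arguments are placeholders):
    #
    #     ag.save(scheduler.state_dict(), 'checkpoint.ag')        # write checkpoint
    #     ...
    #     scheduler = FIFOScheduler(train_fn, ...)                # fresh scheduler
    #     scheduler.load_state_dict(ag.load('checkpoint.ag'))     # right after construction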
Ejemplo n.º 33
0
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        anchor_sizes=[32, 64, 128, 256, 512],
        anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)],
        anchor_aspect_ratios=[0.5, 1, 2],
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        background_iou_thresh=0.4,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        weight_decay=0.000001,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=0,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        decode_number=5000,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # Check and log the operating system
    logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Efficient Detector")
    input_shape = (1, 3) + tuple(input_size)

    net = Efficient(version=base,
                    anchor_sizes=anchor_sizes,
                    anchor_size_ratios=anchor_size_ratios,
                    anchor_aspect_ratios=anchor_aspect_ratios,
                    anchor_box_clip=anchor_box_clip,
                    alloc_size=anchor_alloc_size,
                    ctx=mx.cpu())
    train_dataloader, train_dataset = traindataloader(
        multiscale=multiscale,
        factor_scale=factor_scale,
        augmentation=data_augmentation,
        path=train_dataset_path,
        input_size=input_size,
        batch_size=batch_size,
        batch_interval=batch_interval,
        num_workers=num_workers,
        shuffle=True,
        mean=mean,
        std=std,
        net=net,
        foreground_iou_thresh=foreground_iou_thresh,
        background_iou_thresh=background_iou_thresh,
        make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            background_iou_thresh=background_iou_thresh,
            make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    model = str(input_size[0]) + "_" + str(
        input_size[1]) + "_" + optimizer + "_EFF_" + str(base)

    weight_path = os.path.join("weights", f"{model}")
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')
    optimizer_path = os.path.join(weight_path,
                                  f'{model}-{load_period:04d}.opt')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)}\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'],
                                        param_path,
                                        ctx=ctx)
    else:
        start_epoch = 0
        net = Efficient(
            version=base,
            input_size=input_size,
            anchor_sizes=anchor_sizes,
            anchor_size_ratios=anchor_size_ratios,
            anchor_aspect_ratios=anchor_aspect_ratios,
            num_classes=num_classes,  # foreground classes only
            anchor_box_clip=anchor_box_clip,
            alloc_size=anchor_alloc_size,
            ctx=ctx)

        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))
        '''
        active (bool, default True) – Whether to turn hybrid on or off.
        static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase.
        static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower.
        '''
        if multiscale:
            net.hybridize(active=True, static_alloc=True, static_shape=False)
        else:
            net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch >= epoch:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) //
                 batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)
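    # Note: with FactorScheduler the effective learning rate follows
    #   lr(num_update) = max(learning_rate * decay_lr ** (num_update // step), stop_factor_lr)
    # i.e. it is multiplied by `decay_lr` once every `step` updates (roughly every
    # `decay_step` epochs, given `unit` updates per epoch).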

    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'
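    # grad_req='add' accumulates gradients across the `subdivision` forward/backward
    # passes inside each batch; trainer.step() is then called once per batch and the
    # gradients are cleared manually with p.zero_grad() afterwards.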
    '''
    update_on_kvstore : bool, default None
    Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
    suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
    provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
    '''
    if optimizer.upper() == "ADAM":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "beta1": 0.9,
                                    "beta2": 0.999,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "RMSPROP":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "gamma1": 0.9,
                                    "gamma2": 0.999,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    elif optimizer.upper() == "SGD":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={
                                    "learning_rate": learning_rate,
                                    "lr_scheduler": lr_sch,
                                    "wd": weight_decay,
                                    "momentum": 0.9,
                                    'multi_precision': False
                                },
                                update_on_kvstore=False
                                if AMP else None)  # for Dynamic loss scaling
    else:
        logging.error("optimizer not selected")
        exit(0)

    if AMP:
        amp.init_trainer(trainer)

    # load optimizer state
    if os.path.exists(optimizer_path):
        try:
            trainer.load_states(optimizer_path)
        except Exception as E:
            logging.info(E)
        else:
            logging.info(f"loading {os.path.basename(optimizer_path)}\n")
    '''
    localization loss -> Smooth L1 loss 
    confidence loss -> Focal 
    '''
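    # For reference, the standard definitions (the FocalLoss/HuberLoss classes used
    # here are project-local, so their exact arguments may differ):
    #   focal loss:  FL(p_t) = -alpha * (1 - p_t) ** gamma * log(p_t)
    #   huber loss:  0.5 * x**2 / rho  if |x| < rho  else  |x| - 0.5 * rho,  x = pred - target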
    confidence_loss = FocalLoss(alpha=0.25,
                                gamma=2,
                                sparse_label=True,
                                from_sigmoid=False,
                                batch_axis=None,
                                num_class=num_classes,
                                reduction="sum",
                                exclude=False)

    localization_loss = HuberLoss(rho=1,
                                  batch_axis=None,
                                  reduction="sum",
                                  exclude=False)

    prediction = Prediction(batch_size=batch_size,
                            from_sigmoid=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all,
                          _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all,
                                  num_outputs=subdivision,
                                  axis=0)
            box_all = mx.nd.split(data=box_all,
                                  num_outputs=subdivision,
                                  axis=0)

            if subdivision == 1:
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]
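            # mx.nd.split returns the NDArray itself (not a one-element list) when
            # num_outputs == 1, which is why the outputs are wrapped above so the
            # zip() below always iterates over lists.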
            '''
            autograd explained:
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
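            # Minimal autograd pattern for reference (illustration only; it mirrors the
            # record()/backward() calls used below):
            #
            #     x = mx.nd.ones((2, 2))
            #     x.attach_grad()
            #     with autograd.record():
            #         y = (x * 2).sum()
            #     y.backward()          # x.grad now holds dy/dx = 2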
            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    image_split = gluon.utils.split_and_load(image_split,
                                                             ctx_list,
                                                             even_split=False)
                    cls_split = gluon.utils.split_and_load(cls_split,
                                                           ctx_list,
                                                           even_split=False)
                    box_split = gluon.utils.split_and_load(box_split,
                                                           ctx_list,
                                                           even_split=False)

                    # per-device loss buffers for data parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # handle N GPUs (data parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        cls_pred, box_pred, anchor = net(img)
                        except_ignore_samples = cls_target > -1
                        positive_samples = cls_target > 0
                        positive_numbers = positive_samples.sum()

                        conf_loss = confidence_loss(
                            cls_pred, cls_target,
                            except_ignore_samples.expand_dims(axis=-1))
                        conf_loss = mx.nd.divide(conf_loss,
                                                 positive_numbers + 1)
                        cls_losses.append(conf_loss.asscalar())

                        loc_loss = localization_loss(
                            box_pred, box_target,
                            positive_samples.expand_dims(axis=-1))
                        box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss + loc_loss)
                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # clear accumulated gradients
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            # save optimizer state
            try:
                trainer.save_states(
                    os.path.join(weight_path, f'{model}-{i:04d}.opt'))
            except Exception as E:
                logging.error(f"optimizer weight export 예외 발생 : {E}")
            else:
                logging.info("optimizer weight export 성공")
            '''
            Hybrid models can be serialized as JSON files using the export function
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            When there is only one input, it will be named 'data'. When there is more than one input, they will be named data0, data1, etc.
            '''
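            # The symbol/params files exported below can later be re-loaded with
            # SymbolBlock.imports (sketch, mirroring the resume logic near the top of
            # this function):
            #
            #     net = gluon.SymbolBlock.imports(f"{model}-symbol.json", ['data'],
            #                                     f"{model}-{i:04d}.params", ctx=context)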
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)
            '''
                In mxnet 1.6.0, reusing the `prediction` declared above can cause
                problems when AMP is enabled (yolo v3 and gaussian yolo v3 are
                affected). In mxnet 1.5.x it works fine without re-declaring it
                as done below.

                Blocks passed as function arguments are passed by reference, not
                copied. When export_block_for_cplusplus hybridizes `prediction`,
                the previously declared `prediction` is hybridized as well and
                turned into symbol form, so re-declaring it here seems to be the
                right thing to do.
            '''
            auxnet = Prediction(from_sigmoid=False,
                                num_classes=num_classes,
                                decode_number=decode_number,
                                nms_thresh=nms_thresh,
                                nms_topk=nms_topk,
                                except_class_thresh=except_class_thresh,
                                multiperclass=multiperclass)
            postnet = PostNet(net=net, auxnet=auxnet)
            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # for ONNX export
                # handles network inference, decoding and NMS - convenient from the MXNet C++ API
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=True,  # in C++, an image read with OpenCV can be fed directly at inference time
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)

            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

        if i % eval_period == 0 and valid_list:

            conf_loss_sum = 0
            loc_loss_sum = 0

            # compute validation losses
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all,
                                                     ctx_list,
                                                     even_split=False)
                box_all = gluon.utils.split_and_load(box_all,
                                                     ctx_list,
                                                     even_split=False)

                # per-device loss buffers for data parallelism
                cls_losses = []
                box_losses = []

                # handle N GPUs (data parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    except_ignore_samples = cls_target > -1
                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        except_ignore_samples.expand_dims(axis=-1))
                    conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)

            AP_appender = np.nan_to_num(AP_appender)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                # handle N GPUs (data parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 1, 0)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred,
                                                     anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)
                        ig = ig.astype(np.uint8)

                        # draw ground-truth boxes
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # draw predicted boxes
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # for TensorBoard: transpose (height, width, channel) -> (channel, height, width)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                for p in net.collect_params().values():
                    summary.add_histogram(tag=p.name,
                                          values=p.data(ctx=ctx_list[0]),
                                          global_step=i,
                                          bins='default')

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))