def get_lr_scheduler(args, train_loader):
    if args.optim_phase == 'Factor':
        every_lr_decay_step = args.every_lr_decay_step
        lr_scheduler = FactorScheduler(step=every_lr_decay_step, factor=0.1)
    elif args.optim_phase == 'MultiFactor':
        lr_decay_steps = [
            len(train_loader) * ep for ep in args.lr_decay_epochs
        ]
        lr_scheduler = MultiFactorScheduler(step=lr_decay_steps, factor=0.1)
    elif args.optim_phase == 'Poly':
        max_update_step = args.epochs
        lr_scheduler = PolyScheduler(max_update=max_update_step)
    elif args.optim_phase == 'Cosine':
        max_update_step = args.epochs
        lr_scheduler = CosineScheduler(max_update=max_update_step)
    else:
        raise ValueError('Invalid phase {}'.format(args.optim_phase))
    return lr_scheduler
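A minimal usage sketch of the decay styles the factory above switches between, using MXNet's built-in mx.lr_scheduler classes; the step counts and base learning rate below are illustrative, not taken from the surrounding script:

import mxnet as mx

base_lr = 0.1
# 'Factor' style: decay by 10x every fixed number of updates.
factor_sched = mx.lr_scheduler.FactorScheduler(step=5000, factor=0.1, base_lr=base_lr)
# 'MultiFactor' style: decay by 10x at explicit update counts.
multi_sched = mx.lr_scheduler.MultiFactorScheduler(step=[3000, 6000], factor=0.1, base_lr=base_lr)

for num_update in (1, 3001, 6001, 9001):
    # schedulers are callable: they map an update count to a learning rate
    print(num_update, factor_sched(num_update), multi_sched(num_update))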
Example 2
def test_lognormal():
    var = mx.symbol.Variable('var')
    data = mx.symbol.Variable('data')
    net_mean = mx.symbol.FullyConnected(data=data,
                                        name='fc_mean_1',
                                        num_hidden=20)
    net_mean = mx.symbol.Activation(data=net_mean,
                                    name='fc_mean_relu_1',
                                    act_type='relu')
    # stack the second mean layer on top of the first (the original fed raw data here)
    net_mean = mx.symbol.FullyConnected(data=net_mean,
                                        name='fc_mean_2',
                                        num_hidden=20)
    net_mean = mx.symbol.Activation(data=net_mean,
                                    name='fc_mean_relu_2',
                                    act_type='relu')
    net_mean = mx.symbol.FullyConnected(data=net_mean,
                                        name='fc_mean_3',
                                        num_hidden=10)
    net_var = mx.symbol.FullyConnected(data=data,
                                       name='fc_var_1',
                                       num_hidden=10)
    net_var = mx.symbol.Activation(data=net_var,
                                   name='fc_var_softplus_1',
                                   act_type='softrelu')
    net = mx.symbol.Custom(mean=net_mean,
                           var=net_var,
                           name='policy',
                           deterministic=False,
                           entropy_regularization=0.01,
                           op_type='LogNormalPolicy')
    ctx = mx.gpu()
    minibatch_size = 100
    data_shapes = {
        'data': (minibatch_size, 10),
        'policy_score': (minibatch_size, )
    }  #, 'var':(minibatch_size,)}
    qnet = Base(data_shapes=data_shapes,
                sym_gen=net,
                name='PolicyNet',
                initializer=mx.initializer.Xavier(factor_type="in",
                                                  magnitude=1.0),
                ctx=ctx)
    print(qnet.internal_sym_names)

    lr = 0.01
    lr_scheduler = FactorScheduler(1000, 1.0 / 1.5)
    optimizer = mx.optimizer.create(
        name='sgd',
        learning_rate=lr,  #momentum=0.9,
        clip_gradient=None,
        lr_scheduler=lr_scheduler,
        rescale_grad=1.0,
        wd=0.)
    updater = mx.optimizer.get_updater(optimizer)
    total_iter = 1000000
    stats = numpy.zeros((total_iter, 3), dtype=numpy.float32)
    plt.ion()
    fig, ax = plt.subplots()
    lines, = ax.plot([], [])
    ax.set_autoscaley_on(True)
    baseline = 0
    for i in range(total_iter):
        #    for k, v in qnet.params.items():
        #        print k, v.asnumpy()
        data = numpy.random.randn(minibatch_size, 10)
        means = qnet.compute_internal(sym_name="fc_mean_3_output",
                                      data=data).asnumpy()
        vars = qnet.compute_internal(sym_name="fc_var_softplus_1_output",
                                     data=data).asnumpy()

        outputs = qnet.forward(
            is_train=True,
            data=data)  #, var=0.5*numpy.ones((minibatch_size, )))
        action = outputs[0].asnumpy()
        score = simple_game_multimodal(data, action, 1)
        baseline = baseline - 0.01 * (baseline - score.mean())
        print('score=', score.mean(),
              'err=', numpy.square(means - data * data).mean(),
              'var=', vars.mean(), 'baseline=', baseline)
        stats[i] = [
            score.mean(),
            numpy.square(means - data * data).mean(),
            vars.mean()
        ]
        qnet.backward(policy_score=score - baseline)
        norm_clipping(qnet.params_grad, 10)
        qnet.update(updater)
        if i % 10 == 0:
            update_line(lines, fig, ax, i,
                        score.mean())  #numpy.square(means - data*data).mean())
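What the loop above implements, stripped to its essentials, is a score-function (REINFORCE-style) update in which the value passed to qnet.backward is the game score minus an exponential-moving-average baseline. A hedged, numpy-only sketch with illustrative shapes:

import numpy as np

baseline = 0.0
for step in range(5):
    scores = np.random.randn(100)                              # stand-in for simple_game_multimodal(...)
    baseline = baseline - 0.01 * (baseline - scores.mean())    # same EMA update as in the loop above
    advantage = scores - baseline                              # what qnet.backward(policy_score=...) receives
    print(step, round(scores.mean(), 3), round(baseline, 3), round(advantage.mean(), 3))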
Example 3
    def train_model(self, action):
        # action belongs to stage4: Training stage
        if action[0] == 1:
            # LF1
            loss = mx.gluon.loss.L2Loss()
        else:
            loss = mx.gluon.loss.HuberLoss()

        # must set batch_size before init model
        batch_size = self.batch_size_option[action[1] - 1]
        self.config['batch_size'] = batch_size
        model = Model(self.action_trajectory, self.config, self.ctx,
                      self.adj_SIPM)
        model.initialize(ctx=self.ctx)
        lr_option = [1e-3, 7e-4, 1e-4]
        opt_option = ['rmsprop', 'adam', 'adam']
        lr = lr_option[action[2] - 1]
        if action[3] == 1:
            # FactorScheduler expects an integer step count of at least 1
            step = max(1, self.epochs // 10)
            lr_scheduler = FactorScheduler(step, factor=0.7, base_lr=lr)
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'lr_scheduler': lr_scheduler})
        elif action[3] == 2:
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'learning_rate': lr})
        else:
            global_train_steps = self.training_samples // batch_size + 1
            max_update_factor = 1
            lr_sch = mx.lr_scheduler.PolyScheduler(
                max_update=global_train_steps * self.epochs *
                max_update_factor,
                base_lr=lr,
                pwr=2,
                warmup_steps=global_train_steps)
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'lr_scheduler': lr_sch})
        try:
            # train
            train_time = 0.
            train_loader, val_loader, test_loader = self.data[batch_size]
            for epoch in range(self.config['epochs']):
                loss_value = 0
                mae = 0
                rmse = 0
                mape = 0
                train_batch_num = 0
                for X in train_loader:
                    y = X.label[0]
                    X = X.data[0]
                    train_batch_num += 1
                    X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
                    with autograd.record():
                        y = y.astype('float32')
                        start_time = time()
                        output = model(X)
                        train_time += time() - start_time
                        l = loss(output, y)
                    if self.test:
                        return
                    l.backward()
                    opt.step(batch_size)
                    loss_value += loss(output, y).mean().asscalar()
                    mae += MAE(y, output)
                    rmse += RMSE(y, output)
                    mape += masked_mape_np(y, output)
                train_loader.reset()
                loss_value /= train_batch_num
                mae /= train_batch_num
                rmse /= train_batch_num
                mape /= train_batch_num
                self.logger(
                    train=[epoch, loss_value, mae, mape, rmse, train_time])
                print(f"    epoch:{epoch} ,loss:{loss_value}")
            model_structure = deepcopy(self.action_trajectory)
            model_structure.append(action)
            # eval
            eval_loss_value = 0
            eval_batch_num = 0
            mae = 0
            rmse = 0
            mape = 0
            val_time = 0.
            for X in val_loader:
                y = X.label[0]
                X = X.data[0]
                eval_batch_num += 1
                X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
                y = y.astype('float32')
                start_time = time()
                output = model(X)
                val_time += time() - start_time
                eval_loss_value += loss(output, y).mean().asscalar()
                mae += MAE(y, output)
                rmse += RMSE(y, output)
                mape += masked_mape_np(y, output)
            eval_loss_value /= eval_batch_num
            mae /= eval_batch_num
            rmse /= eval_batch_num
            mape /= eval_batch_num
            print(
                f"    eval_result: loss:{eval_loss_value}, MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, time:{val_time}"
            )
            val_loader.reset()
            # get reward
            if self.time_max <= val_time:
                return -1, True
            else:
                reward = -(mae - np.power(np.e, -19) *
                           np.log2(self.time_max - val_time))
            if reward < -1e2:
                return -1, True
            else:
                reward /= 100
            self.logger(eval=[eval_loss_value, mae, mape, rmse, val_time])
            self.logger.save_GNN(model, model_structure,
                                 reward / len(self.action_trajectory) + 1)
            return reward, False
        except Exception as e:
            self.logger.append_log_file(e.args[0])
            self.logger(train=None, eval=None, test=None)
            traceback.print_exc()
            return -1, True
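The reward handed back by train_model above can be read off the code: a hard failure (-1) when validation is slower than time_max or when the shaped value falls below -100, otherwise a small negative value dominated by the validation MAE. A standalone, hedged restatement of that shaping (the helper name is illustrative):

import numpy as np

def shaped_reward(mae, val_time, time_max):
    # mirrors the constants used in train_model above; illustrative helper only
    if time_max <= val_time:
        return -1.0, True
    reward = -(mae - np.power(np.e, -19) * np.log2(time_max - val_time))
    if reward < -1e2:
        return -1.0, True
    return reward / 100.0, False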
Example 4
def main(args):
    filehandler = logging.FileHandler(args['log_dir'] + '/train.log')
    streamhandler = logging.StreamHandler()

    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)

    batch_size = args['batch_size']
    classes = 1000

    num_gpus = args['num_gpus']
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = args['num_workers']
    model_name = 'efficientnet-' + args['model']
    lr_decay = args['lr_decay']
    lr_decay_period = args['lr_decay_period']
    warmup_steps = args['warmup_epochs']
    warmup_begin_lr = args['warmup_lr']
    assert lr_decay_period != 0
    lr_scheduler = FactorScheduler(lr_decay_period, lr_decay,
                                   warmup_steps=warmup_steps,
                                   warmup_begin_lr=warmup_begin_lr)
    lr_scheduler.base_lr = args['lr']
    optimizer = 'rmsprop'
    optimizer_params = {'wd': args['wd'], 'gamma1': args['momentum'], 'learning_rate':args['lr']}
    if args['dtype'] != 'float32':
        optimizer_params['multi_precision'] = True
    net, input_size = get_efficientnet(model_name)
    net.cast(args['dtype'])
    if args['resume_params'] != '':
        net.load_parameters(args['resume_params'], ctx=context)
    # Two functions for reading data from record file or raw images
    def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers):
        rec_train = os.path.expanduser(rec_train)
        rec_train_idx = os.path.expanduser(rec_train_idx)
        rec_val = os.path.expanduser(rec_val)
        rec_val_idx = os.path.expanduser(rec_val_idx)
        jitter_param = 0.4
        lighting_param = 0.1
        crop_ratio = args['crop_ratio'] if args['crop_ratio'] > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))
        mean_rgb = [123.68, 116.779, 103.939]
        std_rgb = [58.393, 57.12, 57.375]

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
            return data, label

        train_data = mx.io.ImageRecordIter(
            path_imgrec=rec_train,
            path_imgidx=rec_train_idx,
            preprocess_threads=num_workers,
            shuffle=True,
            batch_size=batch_size,

            data_shape=(3, input_size, input_size),
            mean_r=mean_rgb[0],
            mean_g=mean_rgb[1],
            mean_b=mean_rgb[2],
            std_r=std_rgb[0],
            std_g=std_rgb[1],
            std_b=std_rgb[2],
            rand_mirror=True,
            random_resized_crop=True,
            max_aspect_ratio=4. / 3.,
            min_aspect_ratio=3. / 4.,
            max_random_area=1,
            min_random_area=0.08,
            brightness=jitter_param,
            saturation=jitter_param,
            contrast=jitter_param,
            pca_noise=lighting_param,
        )
        val_data = mx.io.ImageRecordIter(
            path_imgrec=rec_val,
            path_imgidx=rec_val_idx,
            preprocess_threads=num_workers,
            shuffle=False,
            batch_size=batch_size,

            resize=resize,
            data_shape=(3, input_size, input_size),
            mean_r=mean_rgb[0],
            mean_g=mean_rgb[1],
            mean_b=mean_rgb[2],
            std_r=std_rgb[0],
            std_g=std_rgb[1],
            std_b=std_rgb[2],
        )
        return train_data, val_data, batch_fn

    def get_data_loader(data_dir, batch_size, num_workers):
        normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        jitter_param = 0.4
        lighting_param = 0.1
        crop_ratio = args['crop_ratio'] if args['crop_ratio'] > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            return data, label

        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomFlipLeftRight(),
            transforms.RandomColorJitter(brightness=jitter_param, contrast=jitter_param,
                                         saturation=jitter_param),
            transforms.RandomLighting(lighting_param),
            transforms.ToTensor(),
            normalize
        ])
        transform_test = transforms.Compose([
            transforms.Resize(resize, keep_ratio=True),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            normalize
        ])

        train_data = gluon.data.DataLoader(
            imagenet.classification.ImageNet(data_dir, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)
        val_data = gluon.data.DataLoader(
            imagenet.classification.ImageNet(data_dir, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        return train_data, val_data, batch_fn

    if args['use_rec']:
        train_data, val_data, batch_fn = get_data_rec(args['rec_train'], args['rec_train_idx'],
                                                      args['rec_val'], args['rec_val_idx'],
                                                      batch_size, num_workers)
    else:
        train_data, val_data, batch_fn = get_data_loader(args['data_dir'], batch_size, num_workers)

    if args['mixup']:
        train_metric = mx.metric.RMSE()
    else:
        train_metric = mx.metric.Accuracy()
    acc_top1 = mx.metric.Accuracy()
    acc_top5 = mx.metric.TopKAccuracy(5)

    save_frequency = args['save_frequency']
    if args['save_model'] and save_frequency:
        save_dir = args['log_dir']
    else:
        save_dir = ''
        save_frequency = 0

    def mixup_transform(label, classes, lam=1, eta=0.0):
        if isinstance(label, nd.NDArray):
            label = [label]
        res = []
        for l in label:
            y1 = l.one_hot(classes, on_value=1 - eta + eta / classes, off_value=eta / classes)
            y2 = l[::-1].one_hot(classes, on_value=1 - eta + eta / classes, off_value=eta / classes)
            res.append(lam * y1 + (1 - lam) * y2)
        return res

    def smooth(label, classes, eta=0.1):
        if isinstance(label, nd.NDArray):
            label = [label]
        smoothed = []
        for l in label:
            res = l.one_hot(classes, on_value=1 - eta + eta / classes, off_value=eta / classes)
            smoothed.append(res)
        return smoothed

    def test(ctx, val_data):
        if args['use_rec']:
            val_data.reset()
        acc_top1.reset()
        acc_top5.reset()
        for i, batch in enumerate(val_data):
            data, label = batch_fn(batch, ctx)
            outputs = [net(X.astype(args['dtype'], copy=False), ag.is_training()) for X in data]
            acc_top1.update(label, outputs)
            acc_top5.update(label, outputs)

        _, top1 = acc_top1.get()
        _, top5 = acc_top5.get()
        return (1 - top1, 1 - top5)

    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        if args['resume_params'] == '':
            net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if args['no_wd']:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
        if args['resume_states'] != '':
            trainer.load_states(args['resume_states'])

        if args['label_smoothing'] or args['mixup']:
            sparse_label_loss = False
        else:
            sparse_label_loss = True

        L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)
        best_val_score = 1

        for epoch in range(args['resume_epoch'], args['num_epochs']):
            tic = time.time()
            if args['use_rec']:
                train_data.reset()
            train_metric.reset()
            btic = time.time()
            lr = lr_scheduler(epoch + 1)
            trainer.set_learning_rate(lr)
            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)

                if args['mixup']:
                    lam = np.random.beta(args['mixup_alpha'], args['mixup_alpha'])
                    if epoch >= args['num_epochs'] - args['mixup_off_epoch']:
                        lam = 1
                    data = [lam * X + (1 - lam) * X[::-1] for X in data]

                    if args['label_smoothing']:
                        eta = 0.1
                    else:
                        eta = 0.0
                    label = mixup_transform(label, classes, lam, eta)

                elif args['label_smoothing']:
                    hard_label = label
                    label = smooth(label, classes)

                with ag.record():
                    outputs = [net(X.astype(args['dtype'], copy=False), ag.is_training()) for X in data]
                    loss = [L(yhat, y.astype(args['dtype'], copy=False)) for yhat, y in zip(outputs, label)]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)

                if args['mixup']:
                    output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                                      for out in outputs]
                    train_metric.update(label, output_softmax)
                else:
                    if args['label_smoothing']:
                        train_metric.update(hard_label, outputs)
                    else:
                        train_metric.update(label, outputs)

                if args['log_interval'] and not (i + 1) % args['log_interval']:
                    train_metric_name, train_metric_score = train_metric.get()
                    logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f' % (
                        epoch, i, batch_size * args['log_interval'] / (time.time() - btic),
                        train_metric_name, train_metric_score, trainer.learning_rate))
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * (i+1) / (time.time() - tic))

            err_top1_val, err_top5_val = test(ctx, val_data)

            logger.info('[Epoch %d] training: %s=%f' % (epoch, train_metric_name, train_metric_score))
            logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' % (epoch, throughput, time.time() - tic))
            logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f' % (epoch, err_top1_val, err_top5_val))

            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                net.save_parameters(
                    '%s/%.4f-imagenet-%s-%d-best.params' % (save_dir, best_val_score, model_name, epoch))
                trainer.save_states(
                    '%s/%.4f-imagenet-%s-%d-best.states' % (save_dir, best_val_score, model_name, epoch))

            if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
                net.save_parameters('%s/imagenet-%s-%d.params' % (save_dir, model_name, epoch))
                trainer.save_states('%s/imagenet-%s-%d.states' % (save_dir, model_name, epoch))

        if save_frequency and save_dir:
            net.save_parameters('%s/imagenet-%s-%d.params' % (save_dir, model_name, args['num_epochs'] - 1))
            trainer.save_states('%s/imagenet-%s-%d.states' % (save_dir, model_name, args['num_epochs'] - 1))

    if args['mode'] == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    train(context)
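The mixup_transform and smooth helpers defined inside main() both build soft one-hot targets; mixup additionally blends each label vector with the batch in reverse order. A hedged, standalone sketch on a tiny batch (the class count, lam, and eta values are illustrative):

from mxnet import nd

label = nd.array([0, 2, 1])
classes, lam, eta = 4, 0.7, 0.1

y1 = label.one_hot(classes, on_value=1 - eta + eta / classes, off_value=eta / classes)
y2 = label[::-1].one_hot(classes, on_value=1 - eta + eta / classes, off_value=eta / classes)
mixed = lam * y1 + (1 - lam) * y2   # what mixup_transform appends for each label array
print(mixed.asnumpy())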
Example 5
    def train_model(self, actions: list):
        # drop the [-1, -1, -1, -1] placeholder actions
        # (popping while iterating over the index range can skip entries)
        actions = [a for a in actions if a != [-1, -1, -1, -1]]
        # fetch training_stage_action and remove it from model structure action
        action = actions[0]
        actions.pop(0)
        self.action_trajectory = actions
        # action belongs to stage4: Training stage
        if action[0] == 1:
            # LF1
            loss = mx.gluon.loss.L2Loss()
        else:
            loss = mx.gluon.loss.HuberLoss()

        # must set batch_size before init model
        batch_size = self.batch_size_option[action[1] - 1]
        # transformer = self.transformer[batch_size]
        self.config['batch_size'] = batch_size
        model = Model(self.action_trajectory, self.config, self.ctx,
                      self.adj_SIPM)
        model.initialize(ctx=self.ctx)
        lr_option = [1e-3, 7e-4, 1e-4]
        opt_option = ['rmsprop', 'adam', 'adam']
        lr = lr_option[action[2] - 1]
        if action[3] == 1:
            # FactorScheduler expects an integer step count of at least 1
            step = max(1, self.epochs // 10)
            lr_scheduler = FactorScheduler(step, factor=0.7, base_lr=lr)
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'lr_scheduler': lr_scheduler})
        elif action[3] == 2:
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'learning_rate': lr})
        else:
            global_train_steps = self.training_samples // batch_size + 1
            max_update_factor = 1
            lr_sch = mx.lr_scheduler.PolyScheduler(
                max_update=global_train_steps * self.epochs *
                max_update_factor,
                base_lr=lr,
                pwr=2,
                warmup_steps=global_train_steps)
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'lr_scheduler': lr_sch})
        # train
        train_loader, val_loader, test_loader = self.data[batch_size]
        model_structure = deepcopy(self.action_trajectory)
        model_structure.append(action)
        best_mae = float('inf')
        best_epoch = 0
        for epoch in range(self.config['epochs']):
            self.logger.set_episode(epoch)
            loss_value = 0
            mae = 0
            rmse = 0
            mape = 0
            train_batch_num = 0
            train_time = 0.
            for X in train_loader:
                y = X.label[0]
                X = X.data[0]
                train_batch_num += 1
                X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
                with autograd.record():
                    y = y.astype('float32')
                    start_time = time()
                    output = model(X)
                    train_time += time() - start_time
                    l = loss(output, y)
                if self.test:
                    return
                l.backward()
                opt.step(batch_size)
                # loss_value_raw += l.mean().asscalar()
                loss_value += loss(output, y).mean().asscalar()
                mae += MAE(y, output)
                rmse += RMSE(y, output)
                mape += masked_mape_np(y, output)
            train_loader.reset()
            # loss_value_raw /= train_batch_num
            loss_value /= train_batch_num
            mae /= train_batch_num
            rmse /= train_batch_num
            mape /= train_batch_num
            train_time /= self.train_set_sample_num  # average forward time per training sample
            self.logger(train=[epoch, loss_value, mae, mape, rmse, train_time])
            print(
                f"    epoch:{epoch}  ,loss:{loss_value}, MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, time:{train_time}"
            )
            # eval
            eval_loss_value = 0
            val_time = 0.
            eval_batch_num = 0
            mae = 0
            rmse = 0
            mape = 0
            for X in val_loader:
                y = X.label[0]
                X = X.data[0]
                eval_batch_num += 1
                X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
                y = y.astype('float32')
                start_time = time()
                output = model(X)
                val_time += time() - start_time
                # eval_loss_value_raw += loss(output, y).mean().asscalar()
                eval_loss_value += loss(output, y).mean().asscalar()
                mae += MAE(y, output)
                rmse += RMSE(y, output)
                mape += masked_mape_np(y, output)
            eval_loss_value /= eval_batch_num
            mae /= eval_batch_num
            rmse /= eval_batch_num
            mape /= eval_batch_num
            print(
                f"    eval_result: loss:{eval_loss_value}, MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, time:{val_time}"
            )
            val_loader.reset()
            self.logger(eval=[eval_loss_value, mae, mape, rmse, val_time])
            self.logger.save_GNN(model, model_structure, mae)
            self.logger.update_data_units()
            self.logger.flush_log()
            if mae < best_mae:
                best_mae = mae
                best_epoch = epoch
            if epoch - best_epoch > 10:
                print(f'early stop at epoch:{epoch}')
                break
        # test
        # load best eval metric model parameters
        model.load_params(os.path.join(
            os.path.join(self.logger.log_path, 'GNN'),
            'best_GNN_model.params'),
                          ctx=self.ctx)
        test_loss_value = 0
        test_batch_num = 0
        mae = 0
        rmse = 0
        mape = 0
        test_time = 0.
        for X in test_loader:
            y = X.label[0]
            X = X.data[0]
            test_batch_num += 1
            X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
            y = y.astype('float32')
            start_time = time()
            output = model(X)
            test_time += time() - start_time
            # test_loss_value_raw += loss(output, y).mean().asscalar()
            test_loss_value += loss(output, y).mean().asscalar()
            mae += MAE(y, output)
            rmse += RMSE(y, output)
            mape += masked_mape_np(y, output)
        test_loss_value /= test_batch_num
        mae /= test_batch_num
        rmse /= test_batch_num
        mape /= test_batch_num
        test_loader.reset()
        print(
            f"    test_result: loss:{test_loss_value}, MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, TIME:{test_time}"
        )
        self.logger(test=[test_loss_value, mae, mape, rmse, test_time])
        self.logger.update_data_units()
        self.logger.flush_log()
        return [mae, mape, rmse, test_time]
Example 6
    def test_model(self, actions: list):
        # drop the [-1, -1, -1, -1] placeholder actions
        # (popping while iterating over the index range can skip entries)
        actions = [a for a in actions if a != [-1, -1, -1, -1]]
        # fetch training_stage_action and remove it from model structure action
        action = actions[0]
        actions.pop(0)
        self.action_trajectory = actions
        # action belongs to stage4: Training stage
        if action[0] == 1:
            # LF1
            loss = mx.gluon.loss.L2Loss()
        else:
            loss = mx.gluon.loss.HuberLoss()

        # must set batch_size before init model
        batch_size = self.batch_size_option[action[1] - 1]
        # transformer = self.transformer[batch_size]
        self.config['batch_size'] = batch_size
        model = Model(self.action_trajectory, self.config, self.ctx,
                      self.adj_SIPM)
        model.initialize(ctx=self.ctx)
        lr_option = [1e-3, 7e-4, 1e-4]
        opt_option = ['rmsprop', 'adam', 'adam']
        lr = lr_option[action[2] - 1]
        if action[3] == 1:
            # FactorScheduler expects an integer step count of at least 1
            step = max(1, self.epochs // 10)
            lr_scheduler = FactorScheduler(step, factor=0.7, base_lr=lr)
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'lr_scheduler': lr_scheduler})
        elif action[3] == 2:
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'learning_rate': lr})
        else:
            global_train_steps = self.training_samples // batch_size + 1
            max_update_factor = 1
            lr_sch = mx.lr_scheduler.PolyScheduler(
                max_update=global_train_steps * self.epochs *
                max_update_factor,
                base_lr=lr,
                pwr=2,
                warmup_steps=global_train_steps)
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'lr_scheduler': lr_sch})
        # train
        train_loader, val_loader, test_loader = self.data[batch_size]
        # test
        # load best eval metric model parameters
        model.load_params(
            f'./Log/{self.dataset_name.upper()}_experiment2_qlearning_2_test/GNN/best_GNN_model.params',
            ctx=self.ctx)
        test_loss_value = 0
        test_batch_num = 0
        mae = 0
        rmse = 0
        mape = 0
        test_time = 0.
        for X in test_loader:
            y = X.label[0]
            X = X.data[0]
            test_batch_num += 1
            X, y = X.as_in_context(self.ctx), y.as_in_context(self.ctx)
            y = y.astype('float32')
            start_time = time()
            output = model(X)
            test_time += time() - start_time
            # test_loss_value_raw += loss(output, y).mean().asscalar()
            test_loss_value += loss(output, y).mean().asscalar()
            mae += MAE(y, output)
            rmse += RMSE(y, output)
            mape += masked_mape_np(y, output)
        test_loss_value /= test_batch_num
        mae /= test_batch_num
        rmse /= test_batch_num
        mape /= test_batch_num
        test_loader.reset()
        print(
            f"    test_result: loss:{test_loss_value}, MAE:{mae}, MAPE:{mape}, RMSE:{rmse}, TIME:{test_time}"
        )
        self.logger(test=[test_loss_value, mae, mape, rmse, test_time])
        self.logger.update_data_units()
        self.logger.flush_log()
        return [mae, mape, rmse, test_time]
Example 7
def train_net(args, ctx):
    logger.auto_set_dir()

    from symbols.tiny import resnet101_deeplab_new

    sym_instance = resnet101_deeplab_new()
    sym = sym_instance.get_symbol(NUM_CLASSES, is_train=True, memonger=False)

    #digraph = mx.viz.plot_network(sym, save_format='pdf')
    #digraph.render()

    # setup multi-gpu
    gpu_nums = len(ctx)
    input_batch_size = args.batch_size * gpu_nums

    train_data = get_data("train", DATA_DIR, LIST_DIR, len(ctx))
    test_data = get_data("val", DATA_DIR, LIST_DIR, len(ctx))

    # infer max shape
    max_scale = [args.crop_size]
    max_data_shape = [('data', (args.batch_size, 3,
                                max([v[0] for v in max_scale]),
                                max([v[1] for v in max_scale])))]
    max_label_shape = [('label', (args.batch_size, 1,
                                  max([v[0] for v in max_scale]),
                                  max([v[1] for v in max_scale])))]

    # infer shape
    data_shape_dict = {
        'data': (args.batch_size, 3, args.crop_size[0], args.crop_size[1]),
        'label': (args.batch_size, 1, args.crop_size[0], args.crop_size[1])
    }

    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)

    eval_sym_instance = resnet101_deeplab_new()

    # load and initialize params
    epoch_string = args.load.rsplit("-", 2)[1]
    begin_epoch = 1
    if not args.scratch:
        begin_epoch = int(epoch_string)
        logger.info('continue training from {}'.format(begin_epoch))
        arg_params, aux_params = load_init_param(args.load, convert=True)
    else:
        logger.info(args.load)
        arg_params, aux_params = load_init_param(args.load, convert=True)
        sym_instance.init_weights(arg_params, aux_params)

    # check parameter shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params,
                                        data_shape_dict)

    data_names = ['data']
    label_names = ['label']

    mod = MutableModule(
        sym,
        data_names=data_names,
        label_names=label_names,
        context=ctx,
        max_data_shapes=[max_data_shape for _ in range(gpu_nums)],
        max_label_shapes=[max_label_shape for _ in range(gpu_nums)],
        fixed_param_prefix=fixed_param_prefix)

    # decide training params
    # metric
    fcn_loss_metric = metric.FCNLogLossMetric(args.frequent)
    eval_metrics = mx.metric.CompositeEvalMetric()

    for child_metric in [fcn_loss_metric]:
        eval_metrics.add(child_metric)

    # callback
    batch_end_callbacks = [
        callback.Speedometer(input_batch_size, frequent=args.frequent)
    ]
    #batch_end_callbacks = [mx.callback.ProgressBar(total=train_data.size/train_data.batch_size)]
    epoch_end_callbacks = \
        [mx.callback.module_checkpoint(mod, os.path.join(logger.get_logger_dir(),"mxnetgo"), period=1, save_optimizer_states=True),
         ]

    from mxnet.lr_scheduler import FactorScheduler
    lr_scheduler = FactorScheduler(800)

    # optimizer
    optimizer_params = {
        'wd': 0.0005,
        'learning_rate': 2.5e-2,
        'lr_scheduler': lr_scheduler,
        'rescale_grad': 1.0,
        'clip_gradient': None
    }

    logger.info("epoch scale = {}".format(EPOCH_SCALE))
    mod.fit(train_data=train_data,
            args=args,
            eval_sym_instance=eval_sym_instance,
            eval_data=test_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callbacks,
            batch_end_callback=batch_end_callbacks,
            kvstore=kvstore,
            optimizer='sgld',
            optimizer_params=optimizer_params,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch,
            epoch_scale=EPOCH_SCALE,
            validation_on_last=validation_on_last)
Example 8
data_shapes = {
    'data': (batch_size, state_dimension),
    'policy_score': (batch_size, ),
    'policy_backward_action': (batch_size, action_dimension),
    'critic_label': (batch_size, ),
    'var': (batch_size, action_dimension),
}
sym = actor_critic_policy_sym(action_dimension)
net = Base(data_shapes=data_shapes,
           sym_gen=sym,
           name='ACNet',
           initializer=mx.initializer.Xavier(rnd_type='gaussian',
                                             factor_type='avg',
                                             magnitude=1.0),
           ctx=ctx)
lr_scheduler = FactorScheduler(500, 0.1)
if args.optimizer == 'sgd':
    optimizer = mx.optimizer.create(name='sgd',
                                    learning_rate=args.lr,
                                    lr_scheduler=lr_scheduler,
                                    momentum=0.9,
                                    clip_gradient=None,
                                    rescale_grad=1.0,
                                    wd=0.)
elif args.optimizer == 'adam':
    optimizer = mx.optimizer.create(name='adam',
                                    learning_rate=args.lr,
                                    lr_scheduler=lr_scheduler)
else:
    raise ValueError('optimizer must be chosen between adam and sgd')
updater = mx.optimizer.get_updater(optimizer)
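For reference, the updater returned by mx.optimizer.get_updater is a plain callable taking (index, grad, weight) that mutates the weight in place; presumably this is what the Base wrapper's update() drives per parameter. A hedged, minimal sketch with illustrative shapes:

import mxnet as mx
from mxnet import nd

opt = mx.optimizer.create(name='sgd', learning_rate=0.1)
upd = mx.optimizer.get_updater(opt)

weight = nd.ones((3,))
grad = nd.ones((3,)) * 0.5
upd(0, grad, weight)        # index 0 identifies this parameter across calls
print(weight.asnumpy())     # vanilla SGD: 1.0 - 0.1 * 0.5 = 0.95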
Example 9
    def train_model(self, action):
        # action belongs to stage4: Training stage
        if action[0] == 1:
            # LF1
            loss = mx.gluon.loss.L2Loss()
        else:
            loss = mx.gluon.loss.HuberLoss()
        # must set batch_size before init model
        batch_size = self.batch_size_option[action[1] - 1]
        self.config['batch_size'] = batch_size
        model = Model(self.action_trajectory, self.config, self.ctx,
                      self.adj_SIPM)
        model.initialize(ctx=self.ctx)
        lr_option = [1e-3, 7e-4, 1e-4]
        opt_option = ['rmsprop', 'adam', 'adam']
        lr = lr_option[action[2] - 1]
        if action[3] == 1:
            # FactorScheduler expects an integer step count of at least 1
            step = max(1, self.epochs // 10)
            lr_scheduler = FactorScheduler(step, factor=0.7, base_lr=lr)
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'lr_scheduler': lr_scheduler})
        elif action[3] == 2:
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'learning_rate': lr})
        else:
            global_train_steps = self.training_samples // batch_size + 1
            max_update_factor = 1
            lr_sch = mx.lr_scheduler.PolyScheduler(
                max_update=global_train_steps * self.epochs *
                max_update_factor,
                base_lr=lr,
                pwr=2,
                warmup_steps=global_train_steps)
            opt = mx.gluon.Trainer(model.collect_params(),
                                   opt_option[action[3] - 1],
                                   {'lr_scheduler': lr_sch})
        self.logger(action=self.actions)
        model_structure = deepcopy(self.actions)
        try:
            train_loader, val_loader, test_loader = self.data[batch_size]
            if self.mode == 'search' or self.mode == 'train':
                # train
                train_time = 0.
                best_mae = float('inf')
                best_epoch = 0
                best_test_mae = float('inf')
                best_test_res = None
                for epoch in range(self.config['epochs']):
                    loss_value = 0
                    mae = 0
                    rmse = 0
                    mape = 0
                    train_batch_num = 0
                    for X in train_loader:
                        y = X.label[0]
                        X = X.data[0]
                        train_batch_num += 1
                        X, y = X.as_in_context(self.ctx), y.as_in_context(
                            self.ctx)
                        with autograd.record():
                            y = y.astype('float32')
                            start_time = time()
                            output = model(X)
                            train_time += time() - start_time
                            l = loss(output, y)
                        # if self.test:
                        #     return
                        l.backward()
                        opt.step(batch_size)
                        loss_value += loss(output, y).mean().asscalar()
                        mae += MAE(y, output)
                        rmse += RMSE(y, output)
                        mape += masked_mape_np(y, output)
                    train_loader.reset()
                    loss_value /= train_batch_num
                    mae /= train_batch_num
                    rmse /= train_batch_num
                    mape /= train_batch_num
                    self.logger(
                        train=[epoch, loss_value, mae, mape, rmse, train_time])
                    print(f"    epoch:{epoch} ,loss:{loss_value}")
                    if self.mode == 'train':
                        eval_loss_value, mae, rmse, mape, val_time = self.eval_model(
                            val_loader, model, loss)
                        self.logger(
                            eval=[eval_loss_value, mae, mape, rmse, val_time])
                        self.logger.save_GNN(model, model_structure, mae)
                        if mae < best_mae:
                            best_mae = mae
                            best_epoch = epoch
                        if epoch - best_epoch > 10:
                            print(f'early stop at epoch:{epoch}')
                            break
                        mae, mape, rmse, test_time = self.test_model_without_load(
                            test_loader, model, loss)
                        if mae < best_test_mae:
                            best_test_mae = mae
                            best_test_res = [mae, mape, rmse, test_time]
                        print(f'test_res:{best_test_res}')
            if self.mode == 'search':
                eval_loss_value, mae, rmse, mape, val_time = self.eval_model(
                    val_loader, model, loss)
                # get reward
                if self.time_max - val_time > 0:
                    reward = -mae / 10 + np.power(
                        np.e, -5) * np.log2(self.time_max - val_time)
                else:
                    reward = -10
                if np.isnan(reward) or np.isinf(reward) or reward < -100:
                    self.logger.append_log_file(f"Warning: reward={reward}")
                    reward = -10
                self.logger(eval=[eval_loss_value, mae, mape, rmse, val_time])
                self.logger.save_GNN(model, model_structure,
                                     reward / len(self.action_trajectory) + 1)
                return reward, False
            elif self.mode == 'train':
                self.logger.append_log_file(f'best_test_res:{best_test_res}')
                mae, mape, rmse, test_time = self.test_model(test_loader, loss)
                return best_test_res, [mae, mape, rmse, test_time]
            elif self.mode == 'test':
                mae, mape, rmse, test_time = self.test_model(test_loader, loss)
                return None, [mae, mape, rmse, test_time]

        except Exception as e:
            self.logger.append_log_file(e.args[0])
            self.logger(train=None, eval=None, test=None)
            traceback.print_exc()
            return -10, True