Example #1
def test_net_sync(net, criterion, sync, nDevices):
    ctx_list = [mx.cpu(0) for i in range(nDevices)]
    net = DataParallelModel(net, ctx_list, sync=sync)
    criterion = DataParallelCriterion(criterion, ctx_list, sync=sync)
    iters = 10
    bs = 2
    # train mode
    for i in range(iters):
        x = mx.random.uniform(shape=(bs, 1, 28, 28))
        t = nd.ones(shape=(bs))
        with autograd.record():
            y = net(x)
            loss = criterion(y, t)
            autograd.backward(loss)
    # evaluation mode
    for i in range(iters):
        x = mx.random.uniform(shape=(bs, 1, 28, 28))
        y = net(x)
    # block until all asynchronous operations have finished
    nd.waitall()
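The DataParallelModel / DataParallelCriterion wrappers above hide the scatter/gather logic. Without them, the same synchronization pattern is often written by hand with gluon.utils.split_and_load; the sketch below is illustrative only (a toy Dense net on two CPU contexts), not the wrappers' internals.

# Manual data-parallel step with split_and_load (a sketch, not the DataParallel internals)
import mxnet as mx
from mxnet import gluon, autograd, nd

ctx_list = [mx.cpu(0), mx.cpu(1)]
net = gluon.nn.Dense(10)
net.initialize(ctx=ctx_list)
criterion = gluon.loss.SoftmaxCrossEntropyLoss()

x = mx.random.uniform(shape=(8, 20))
t = nd.ones(shape=(8,))
xs = gluon.utils.split_and_load(x, ctx_list)
ts = gluon.utils.split_and_load(t, ctx_list)
with autograd.record():
    losses = [criterion(net(xi), ti) for xi, ti in zip(xs, ts)]
    autograd.backward(losses)
nd.waitall()  # make sure the asynchronous backward passes have finished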
Example #2
def train(num_gpus, batch_size, lr):
    train_data, test_data = utils.load_data_fashion_mnist(batch_size)

    ctx = [mx.gpu(i) for i in range(num_gpus)]
    print('running on', ctx)

    dev_params = [get_params(params, c) for c in ctx]

    for epoch in range(5):
        start = time()
        for data, label in train_data:
            train_batch(data, label, dev_params, ctx, lr)
        nd.waitall()
        print('Epoch: %d, training time = %.1f sec'%(epoch, time() - start))

        # validating on GPU 0
        net = lambda data: lenet(data, dev_params[0])
        test_acc = utils.evaluate_accuracy(test_data, net, ctx[0])
        print('Validation Accuracy = %.4f' % (test_acc))
Example #3
def main():
    os.environ[
        'MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '1'  # use MXNet's cuDNN autotune for roughly a 2x speed-up
    #load model and params
    sym, arg_params, aux_params = mx.model.load_checkpoint(
        'models/mobilenet0.25_yolo3_final', 0)
    executor = sym.simple_bind(ctx=mx.gpu(0),
                               data=(1, 3, 320, 320),
                               grad_req='null',
                               force_rebind=True)
    executor.copy_params_from(arg_params, aux_params)
    # warm up so cuDNN can search for the best convolution configuration
    print("warming up cuDNN config...")
    a = executor.forward(is_train=False, data=mx.nd.zeros((1, 3, 320, 320)))
    nd.waitall()
    print("start!")

    cva = MyDetector(executor)
    cva.Run()
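Once the warm-up pass has let cuDNN pick its algorithms, the same executor can be timed directly. The sketch below is illustrative (the iteration count and report format are not part of the original detector); nd.waitall() is what makes the wall-clock number meaningful, because forward() only enqueues work.

import time

tic = time.time()
for _ in range(100):
    executor.forward(is_train=False, data=mx.nd.zeros((1, 3, 320, 320)))
mx.nd.waitall()  # block until all queued forward passes have actually run
print("mean forward latency: %.2f ms" % ((time.time() - tic) / 100 * 1000))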
Example #4
    def __call__(self, param):
        """Callback to Show speed
        """
        count = param.num_update

        if self.last_count > count:
            self.init = False
        self.last_count = count

        self.loss_metric.update(param.loss[0])

        if self.init:
            if count % self.frequent == 0:
                nd.waitall()
                try:
                    speed = self.frequent * self.batch_size / (time.time() -
                                                               self.tic)
                    speed_total = speed * self.size
                except ZeroDivisionError:
                    speed = float('inf')
                    speed_total = float('inf')

                # summary loss
                loss_scalar = self.loss_metric.get()
                self.summary_writer.add_scalar(tag="loss",
                                               value=loss_scalar,
                                               global_step=param.num_update)
                loss_str_format = "[%d][%s]:%.2f " % (param.num_epoch, "loss",
                                                      loss_scalar)
                self.loss_metric.reset()
                # summary speed
                self.summary_writer.add_scalar(tag="speed",
                                               value=speed,
                                               global_step=param.num_update)
                self.summary_writer.flush()
                if self.rank == 0:
                    logging.info(
                        "Iter:%d Rank:%.2f it/sec Total:%.2f it/sec %s",
                        param.num_update, speed, speed_total, loss_str_format)
                self.tic = time.time()
        else:
            self.init = True
            self.tic = time.time()
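The reason the callback calls nd.waitall() before reading the clock is MXNet's asynchronous engine: operators return as soon as they are enqueued, so without a synchronization point the measured interval would mostly reflect enqueueing. A tiny self-contained demonstration (illustrative values only):

import time
import mxnet as mx

x = mx.nd.random.uniform(shape=(2000, 2000))
tic = time.time()
y = mx.nd.dot(x, x)              # returns almost immediately; work is only enqueued
print('enqueue: %.4f s' % (time.time() - tic))
y.wait_to_read()                 # blocks until the result has actually been computed
print('compute: %.4f s' % (time.time() - tic))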
Example #5
def train(num_gpus, batch_size, lr):
    train_iter, test_iter = gb.load_data_fashion_mnist(
        batch_size, root="../data/fashion-mnist")
    ctx = [mx.gpu(i) for i in range(num_gpus)]
    print("running on:", ctx)
    gpu_params = [get_params(params, c) for c in ctx]
    for epoch in range(4):
        start = time.time()
        for X, y in train_iter:
            train_batch(X, y, gpu_params, ctx, lr)
            nd.waitall()
        train_time = time.time() - start

        def net(x):
            return lenet(x, gpu_params[0])

        test_acc = gb.evaluate_accuracy(test_iter, net, ctx[0])
        print("epoch %d, time: %.1f sec, test acc: %.2f" %
              (epoch + 1, train_time, test_acc))
Example #6
    def speed(net, ctx, data_size=(1024, 1024), iterations=1000, warm_up=500):
        net.hybridize(static_alloc=True)
        sample = EvalFactory._sample(data_size, ctx[0])

        logger.info(f'Warm-up starts for {warm_up} forward passes...')
        for _ in range(warm_up):
            with autograd.record(False):
                net.predict(sample)
        nd.waitall()

        logger.info(f'Evaluate inference speed for {iterations} forward passes...')
        start = time.time()
        for _ in range(iterations):
            with autograd.record(False):
                net.predict(sample)
        nd.waitall()
        time_cost = time.time() - start

        logger.info('Total time: %.2fs, latency: %.2fms, FPS: %.1f'
                    % (time_cost, time_cost / iterations * 1000, iterations / time_cost))
Example #7
    def process(self, images, return_time=False):
        output = self.model(images)[-1]
        output['hm'] = output['hm'].sigmoid()
        output['dep'] = 1. / (output['dep'].sigmoid() + 1e-6) - 1.
        wh = output['wh'] if self.opt.reg_bbox else None
        reg = output['reg'] if self.opt.reg_offset else None

        nd.waitall()
        forward_time = time.time()
        dets = decode_centernet_3dod(output['hm'],
                                     output['rot'],
                                     output['dep'],
                                     output['dim'],
                                     wh=wh,
                                     reg=reg,
                                     K=self.opt.K)
        if return_time:
            return output, dets, forward_time
        else:
            return output, dets
Example #8
def train(num_gpus, batch_size, lr):
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    ctx = [mx.gpu(i) for i in range(num_gpus)]
    print('running on:', ctx)
    # Copy the model parameters to the memory of each of the num_gpus GPUs
    gpu_params = [get_params(params, c) for c in ctx]
    for epoch in range(4):
        start = time.time()
        for X, y in train_iter:
            # Multi-GPU training on a single mini-batch
            train_batch(X, y, gpu_params, ctx, lr)
            nd.waitall()
        train_time = time.time() - start

        def net(x):  # validate the model on gpu(0)
            return lenet(x, gpu_params[0])

        test_acc = d2l.evaluate_accuracy(test_iter, net, ctx[0])
        print('epoch %d, time %.1f sec, test acc %.2f' %
              (epoch + 1, train_time, test_acc))
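The helpers get_params and train_batch are not shown in these multi-GPU snippets; they follow the pattern from the "Dive into Deep Learning" book. The sketch below paraphrases that pattern (an assumption about the helpers, not their verbatim code):

def get_params(params, ctx):
    # copy each parameter to the target device and attach a gradient buffer
    new_params = [p.copyto(ctx) for p in params]
    for p in new_params:
        p.attach_grad()
    return new_params

def allreduce(data):
    # sum the gradients from every device onto the first one, then broadcast back
    for i in range(1, len(data)):
        data[0][:] += data[i].copyto(data[0].context)
    for i in range(1, len(data)):
        data[0].copyto(data[i])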
Example #9
    def train(self, need_test=False):
        print("Training process starts from epoch {}...".format(
            self.resume_epoch))
        for epoch in range(self.resume_epoch, self.epochs):
            self.current_epoch = epoch
            for _, item in enumerate(self.train_loader):
                inputs, labels = item
                inputs = inputs.as_in_context(self.ctx)
                labels = labels.as_in_context(self.ctx)
                cls = 0.0
                with autograd.record():  # Gradient
                    outputs = self.model(inputs)
                    for _, loss_type in self.loss_functions.items():
                        cls += loss_type(outputs=outputs,
                                         labels=labels,
                                         train_total=self.train_total)
                cls.backward()

                self.train_total += inputs.shape[0]
                self.trainer.step(batch_size=inputs.shape[0])

                cls = nd.array(cls).asscalar()
                if self.train_total % self.args.steps_per_log == 0:
                    self.trainer_log.print_batch_log(
                        current_lr=self.lr_scheduler.base_lr,
                        current_epoch=self.current_epoch,
                        epochs=self.epochs,
                        train_total=self.train_total,
                        loss=cls,
                    )

            nd.waitall()
            if (epoch + 1) % self.args.epochs_per_val == 0:
                if need_test is True:
                    self.test()
                self.best_accuracy = self.check_point.save_checkpoint_parameters(
                    epoch=self.current_epoch,
                    model=self.model,
                    current_accuracy=self.current_accuracy,
                    best_accuracy=self.best_accuracy)
        self.trainer_log.log_close()
Example #10
    def speed(self, iterations=1000, warm_up=500):
        """speed test with hybridized HybridBlock"""
        self.net.hybridize(static_alloc=True)

        # warm-up to obtain stable speed
        print("Warm-up for %d forward passes..." % warm_up)
        for _ in range(warm_up):
            with autograd.record(False):
                self.net.predict(self.sample)
            nd.waitall()

        # speed test
        print("Speed test for %d forward passes..." % iterations)
        t_start = time.time()
        for _ in range(iterations):
            with autograd.record(False):
                self.net.predict(self.sample)
        nd.waitall()
        time_cost = time.time() - t_start
        return time_cost, (time_cost / iterations
                           ) * 1000 / self.bs, iterations * self.bs / time_cost
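The method returns total time, per-sample latency in milliseconds, and throughput in samples per second. A usage sketch (the evaluator object and its construction are assumptions):

# Hypothetical use of the returned tuple
total_s, latency_ms, fps = evaluator.speed(iterations=200, warm_up=50)
print("total %.2f s, latency %.2f ms/sample, throughput %.1f samples/s"
      % (total_s, latency_ms, fps))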
Example #11
def train(X, contents_Y, styles_Y, ctx, lr, max_epochs, lr_decay_epoch):
    X, styles_Y_gram, trainer = get_inits(X, ctx, lr, styles_Y)
    for i in range(max_epochs):
        start = time.time()
        with autograd.record():
            contents_Y_hat, styles_Y_hat = extract_features(
                X, content_layers, style_layers)
            contents_l, styles_l, tv_l, l = compute_loss(
                X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram)
        l.backward()
        trainer.step(1)
        nd.waitall()
        if i % 50 == 0 and i != 0:
            print('epoch %3d, content loss %.2f, style loss %.2f, '
                  'TV loss %.2f, %.2f sec' %
                  (i, nd.add_n(*contents_l).asscalar(),
                   nd.add_n(*styles_l).asscalar(), tv_l.asscalar(),
                   time.time() - start))
        if i % lr_decay_epoch == 0 and i != 0:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
            print('change lr to %.1e' % trainer.learning_rate)
    return X
Example #12
    def process(self, images, return_time=False):
        output = self.model(images)[-1]
        output['hm'] = output['hm'].sigmoid()

        if self.opt.hm_hp and not self.opt.mse_loss:
            output['hm_hp'] = output['hm_hp'].sigmoid()

        reg = output['reg'] if self.opt.reg_offset else None
        hm_hp = output['hm_hp'] if self.opt.hm_hp else None
        hp_offset = output['hp_offset'] if self.opt.reg_hp_offset else None

        nd.waitall()
        forward_time = time.time()

        if self.opt.flip_test:
            output['hm'] = (output['hm'][0:1] +
                            flip_tensor(output['hm'][1:2])) / 2
            output['wh'] = (output['wh'][0:1] +
                            flip_tensor(output['wh'][1:2])) / 2
            output['hps'] = (output['hps'][0:1] + flip_lr_off(
                output['hps'][1:2], self.flip_idx)) / 2
            hm_hp = (hm_hp[0:1] + flip_lr(hm_hp[1:2], self.flip_idx)
                     ) / 2 if hm_hp is not None else None
            reg = reg[0:1] if reg is not None else None
            hp_offset = hp_offset[0:1] if hp_offset is not None else None

        dets = decode_centernet_pose(output['hm'],
                                     output['wh'],
                                     output['hps'],
                                     reg=reg,
                                     hm_hp=hm_hp,
                                     hp_offset=hp_offset,
                                     K=self.opt.K)

        if return_time:
            return output, dets, forward_time
        else:
            return output, dets
Example #13
    def train_gpu(self):
        train_data, valid_data = self.load_data()  # training and validation data
        ctx = self.model_ctx_gpu
        print('Running on {}'.format(ctx))

        net = self.model()  # build the model
        net.collect_params().initialize(init=mx.init.Normal(sigma=.1), ctx=ctx)

        smoothing_constant = .01
        trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': smoothing_constant})

        epochs = 10
        for e in range(epochs):
            start = time()
            for batch in train_data:
                self.train_batch(batch, ctx, net, trainer)
            nd.waitall()  # wait until all asynchronous operations have finished
            print('Epoch %d, training time = %.1f sec' % (e, time() - start))
            correct, num = 0.0, 0.0
            for batch in valid_data:
                correct += self.valid_batch(batch, ctx, net)
                num += batch[0].shape[0]
            print('\tvalidation accuracy = %.4f' % (correct / num))
Example #14
    def train(self, video_dataset):
        # Get logger
        train_logger = get_logger(self.cfg.LOG_DIR, 'train_vis_repr_model_%s' % video_dataset.dataset_info)
        params_file, params_select = ['vis_repr_model_%s' % video_dataset.dataset_info], ['vis_repr_model']
        # Init params
        self.load(params_file, params_select, train_logger, allow_init=True)
        # 1. Select params to train
        model_trainer = trainer.Trainer(self._collect_params(params_select), 'adam',
                                        {'wd': 5e-4, 'learning_rate': self.cfg.LR_INIT})
        # 2. Train each epoch
        for e in range(self.cfg.EPOCHS_TRAIN_MAIN):
            # Train each batch
            batch_index = 0
            while True:
                # 1. Load data
                (batch_images, batch_labels, _), finish = video_dataset.get_batch_data_cls(batch_index, self.cfg.BATCH_SIZE_TRAIN_MAIN)
                x_list, y = utils.io.split_and_load_gpu(self.cfg.CTX, [batch_images], batch_labels)
                # 2. Record calculation
                with autograd.record():
                    pred_y = self.feedforward(x_list)
                    loss_value = utils.loss.loss_mse(pred_y, y)
                # 3. Backward & update
                loss_value.backward()
                nd.waitall()
                model_trainer.step(batch_size=self.cfg.BATCH_SIZE_TRAIN_MAIN)
                # Show info
                train_logger.info(self.get_loss_info(
                    'Train vis_repr_model - ', e, batch_index, video_dataset.num_data / self.cfg.BATCH_SIZE_TRAIN_MAIN,
                    loss_value))
                # Move to next
                if finish: break
                else: batch_index += 1
            # Schedules
            self._step_update_learning_rate(e, model_trainer)
            self._step_save_params(e, params_file, params_select)
        # 3. Finish
        train_logger.info("Training accomplished. ")
Example #15
def evaluate(model,
             g,
             features,
             labels,
             mask,
             ctx,
             batch_size,
             mini_batch=True):
    f1 = mx.metric.F1()
    preds = []
    batch_size = batch_size if mini_batch else features.shape[0]
    dataloader = gluon.data.BatchSampler(
        gluon.data.SequentialSampler(features.shape[0]), batch_size, 'keep')
    for batch in dataloader:
        node_flow, batch_nids = g.sample_block(nd.array(batch).astype('int64'))
        preds.append(model(node_flow, features[batch_nids.as_in_context(ctx)]))
        nd.waitall()

    # preds = nd.concat(*preds, dim=0).argmax(axis=1)
    preds = nd.concat(*preds, dim=0)
    mask = nd.array(np.where(mask.asnumpy()), ctx=ctx)
    f1.update(preds=nd.softmax(preds[mask], axis=1).reshape(-3, 0),
              labels=labels[mask].reshape(-1, ))
    return f1.get()[1]
Example #16
def train(params, max_epochs, lr, lr_decay_epoch=200):
    tic = time()
    trainer = gluon.Trainer(params, 'sgd', {'learning_rate': lr})

    for i in range(max_epochs):
        x = params.get('generated_image')
        with autograd.record():
            content_py, style_py = extract_features(x.data(), content_layers,
                                                    style_layers)
            content_L = sum_loss(content_loss, content_py, content_y,
                                 content_weights)
            style_L = sum_loss(style_loss, style_py, style_y, style_weights)

            #             tv_L = tv_loss(x.data())

            loss = content_L + 500 * style_L

        loss.backward()
        trainer.step(1)

        # add sync to avoid large mem usage
        nd.waitall()

        if i % 40 == 0:
            #             print('epoch %3d, content %.3f, style %.3f, tv %.3f, time %.1f sec' % (
            #                 i, content_L.asscalar(), style_L.asscalar(), tv_L.asscalar(), time()-tic))
            print('epoch %3d, content %.3f, style %.3f, time %.1f sec' %
                  (i, content_L.asscalar(), style_L.asscalar(), time() - tic))
            tic = time()

        if i and i % lr_decay_epoch == 0:
            lr *= 0.5
            trainer.set_learning_rate(lr)
            print('change lr to ', lr)

    return params
Example #17
def train(x, max_epochs, lr, lr_decay_epoch=200):
    tic = time()
    for i in range(max_epochs):
        with autograd.record():
            content_py, style_py = extract_features(x, content_layers,
                                                    style_layers)
            content_L = sum_loss(content_loss, content_py, content_y,
                                 content_weights)
            style_L = sum_loss(style_loss, style_py, style_y, style_weights)
            tv_L = tv_weight * tv_loss(x)
            loss = style_L + content_L + tv_L

        loss.backward()
        x.grad[:] /= x.grad.abs().mean() + 1e-8
        x[:] -= lr * x.grad
        # add sync to avoid large mem usage
        nd.waitall()

        if i and i % 50 == 0:
            print(
                'batch %3d, content %.2f, style %.2f, '
                'TV %.2f, time %.1f sec' %
                (i, content_L.asscalar(), style_L.asscalar(), tv_L.asscalar(),
                 time() - tic))
            tic = time()
            canvas, img = postprocess(x)
            cv2.imwrite('result_%d.jpg' % i, img)

        if i and i % lr_decay_epoch == 0:
            lr *= 0.1
            print('change lr to ', lr)
    canvas, img = postprocess(x)
    cv2.imwrite('result.jpg', img)
    #plt.imshow(canvas.asnumpy())
    #plt.show()
    return x
Example #18
def train(x, max_epochs, lr, lr_decay_epoch=200):
    tic = time()
    for i in range(max_epochs):
        with autograd.record():
            content_py, style_py = extract_features(
                x, content_layers, style_layers)
            content_L  = sum_loss(
                content_loss, content_py, content_y, content_weights)
            style_L = sum_loss(
                style_loss, style_py, style_y, style_weights)
            tv_L = tv_weight * tv_loss(x)
            loss = style_L + content_L + tv_L

        loss.backward()
        x.grad[:] /= x.grad.abs().mean()+1e-8
        x[:] -= lr * x.grad
        # add sync to avoid large mem usage
        nd.waitall()

        if i and i % 50 == 0:
            print('batch %3d, content %.2f, style %.2f, '
                  'TV %.2f, time %.1f sec' % (
                i, content_L.asscalar(), style_L.asscalar(),
                tv_L.asscalar(), time()-tic))
            tic = time()
            canvas,img = postprocess(x)
            cv2.imwrite('result_%d.jpg'%i,img)

        if i and i % lr_decay_epoch == 0:
            lr *= 0.1
            print('change lr to ', lr)
    canvas,img = postprocess(x)
    cv2.imwrite('result.jpg',img)
    #plt.imshow(canvas.asnumpy())
    #plt.show()
    return x
Example #19
def train_one_epoch(model, data_loader, trainer, loss_function, ema=None):
    r"""
    One train loop.
    """
    total_batchs = data_loader.total_batchs
    total_loss = 0
    step = 0
    global global_step
    for batch_data in data_loader.next_batch():
        step += 1
        global_step += 1
        # evaluate on the dev set every EVALUATE_INTERVAL batches
        if global_step % EVALUATE_INTERVAL == 0:
            print('global_step == %d' % (global_step))
            print('evaluating dev dataset...')
            f1_score, em_score = evaluate(model, dataset_type='dev', ema=ema)
            print('dev f1: ' + str(f1_score) + ', em: ' + str(em_score))
            dev_f1.append([global_step, f1_score])
            dev_em.append([global_step, em_score])

        context = nd.array([x[0] for x in batch_data])
        query = nd.array([x[1] for x in batch_data])
        c_mask = context > 0
        q_mask = query > 0
        context_char = nd.array([x[2] for x in batch_data])
        query_char = nd.array([x[3] for x in batch_data])
        begin = nd.array([x[4] for x in batch_data])
        end = nd.array([x[5] for x in batch_data])
        batch_sizes = context.shape[0]
        context = gluon.utils.split_and_load(data=context, ctx_list=CTX)
        c_mask = gluon.utils.split_and_load(data=c_mask, ctx_list=CTX)

        query = gluon.utils.split_and_load(data=query, ctx_list=CTX)
        q_mask = gluon.utils.split_and_load(data=q_mask, ctx_list=CTX)
        context_char = gluon.utils.split_and_load(data=context_char,
                                                  ctx_list=CTX)
        query_char = gluon.utils.split_and_load(data=query_char, ctx_list=CTX)
        begin = gluon.utils.split_and_load(data=begin, ctx_list=CTX)
        end = gluon.utils.split_and_load(data=end, ctx_list=CTX)

        with autograd.record():
            different_ctx_loss = [
                loss_function(*model(c, q, cc, qc, cm, qm, b, e)) for c, q, cc,
                qc, cm, qm, b, e in zip(context, query, context_char,
                                        query_char, c_mask, q_mask, begin, end)
            ]

            for loss in different_ctx_loss:
                loss.backward()
        if global_step == 1:
            for name, param in model.collect_params().items():
                ema.add(name, param.data(CTX[0]))
        trainer.set_learning_rate(warm_up_lr(global_step))
        trainer.allreduce_grads()
        reset_embedding_grad(model)
        tmp = []
        for name, paramater in model.collect_params().items():
            grad = paramater.grad(context[0].context)
            if name == 'qanet0_embedding0_weight':
                grad[0:2] += WEIGHT_DECAY * \
                    paramater.data(context[0].context)[0:2]
            else:
                grad += WEIGHT_DECAY * paramater.data(context[0].context)
            tmp.append(grad)
        gluon.utils.clip_global_norm(tmp, CLIP_GRADIENT)
        reset_embedding_grad(model)
        trainer.update(batch_sizes, ignore_stale_grad=True)
        for name, param in model.collect_params().items():
            ema(name, param.data(CTX[0]))

        batch_loss = .0
        for loss in different_ctx_loss:
            batch_loss += loss.mean().asscalar()
        batch_loss /= len(different_ctx_loss)
        total_loss += batch_loss

        batch_train_ce.append([global_step, batch_loss])
        accum_avg_train_ce.append([global_step, total_loss / step])

        print('batch %d/%d, total_loss %.2f, batch_loss %.2f' %
              (step, total_batchs, total_loss / step, batch_loss),
              end='\r',
              flush=True)
        nd.waitall()
Example #20
    def train(self, batch_size=64,
              num_epoch=10,
              eval_metric='acc',
              eval_metric_params={},
              eval_train=False,
              loss ='softmax_cross_entropy',
              loss_params={},
              optimizer='adam',
              optimizer_params=(('learning_rate', 0.001),),
              load_checkpoint=True,
              checkpoint_period=5,
              load_pretrained=False,
              log_period=50,
              context='gpu',
              save_attention_image=False,
              use_teacher_forcing=False,
              normalize=True,
              shuffle_data=False,
              clip_global_grad_norm=None,
              preprocessing=False,
              onnx_export=False):
        num_pus = 1
        if context == 'gpu':
            num_pus = mx.context.num_gpus()
            if num_pus >= 1:
                if num_pus == 1:
                    mx_context = [mx.gpu(0)]
                else:
                    mx_context = [mx.gpu(i) for i in range(num_pus)]
            else:
                logging.error("Context argument is '" + context + "'. But no gpu is present in the system.")
        elif context == 'cpu':
            mx_context = [mx.cpu()]
        else:
            logging.error("Context argument is '" + context + "'. Only 'cpu' and 'gpu are valid arguments'.")
        single_pu_batch_size = int(batch_size/num_pus)

        if preprocessing:
            preproc_lib = "CNNPreprocessor_ResNeXt50_executor"
            train_iter, test_iter, data_mean, data_std, train_images, test_images = self._data_loader.load_preprocessed_data(batch_size, preproc_lib, shuffle_data)
        else:
            train_iter, test_iter, data_mean, data_std, train_images, test_images = self._data_loader.load_data(batch_size, shuffle_data)

        if 'weight_decay' in optimizer_params:
            optimizer_params['wd'] = optimizer_params['weight_decay']
            del optimizer_params['weight_decay']
        if 'learning_rate_decay' in optimizer_params:
            min_learning_rate = 1e-08
            if 'learning_rate_minimum' in optimizer_params:
                min_learning_rate = optimizer_params['learning_rate_minimum']
                del optimizer_params['learning_rate_minimum']
            optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
                                                   optimizer_params['step_size'],
                                                   factor=optimizer_params['learning_rate_decay'],
                                                   stop_factor_lr=min_learning_rate)
            del optimizer_params['step_size']
            del optimizer_params['learning_rate_decay']

        if normalize:
            self._net_creator.construct(context=mx_context, batch_size=batch_size, data_mean=data_mean, data_std=data_std)
        else:
            self._net_creator.construct(context=mx_context, batch_size=batch_size)

        begin_epoch = 0
        if load_checkpoint:
            begin_epoch = self._net_creator.load(mx_context)
        elif load_pretrained:
            self._net_creator.load_pretrained_weights(mx_context)
        else:
            if os.path.isdir(self._net_creator._model_dir_):
                shutil.rmtree(self._net_creator._model_dir_)

        self._networks = self._net_creator.networks

        try:
            os.makedirs(self._net_creator._model_dir_)
        except OSError:
            if not os.path.isdir(self._net_creator._model_dir_):
                raise

        if optimizer == "adamw":
            trainers = [mx.gluon.Trainer(network.collect_params(), AdamW.AdamW(**optimizer_params)) for network in self._networks.values() if len(network.collect_params().values()) != 0]
        else:
            trainers = [mx.gluon.Trainer(network.collect_params(), optimizer, optimizer_params) for network in self._networks.values() if len(network.collect_params().values()) != 0]

        margin = loss_params['margin'] if 'margin' in loss_params else 1.0
        sparseLabel = loss_params['sparse_label'] if 'sparse_label' in loss_params else True
        ignore_indices = [loss_params['ignore_indices']] if 'ignore_indices' in loss_params else []
        loss_axis = loss_params['loss_axis'] if 'loss_axis' in loss_params else -1
        batch_axis = loss_params['batch_axis'] if 'batch_axis' in loss_params else 0
        if loss == 'softmax_cross_entropy':
            fromLogits = loss_params['from_logits'] if 'from_logits' in loss_params else False
            loss_function = mx.gluon.loss.SoftmaxCrossEntropyLoss(axis=loss_axis, from_logits=fromLogits, sparse_label=sparseLabel, batch_axis=batch_axis)
        elif loss == 'softmax_cross_entropy_ignore_indices':
            fromLogits = loss_params['from_logits'] if 'from_logits' in loss_params else False
            loss_function = SoftmaxCrossEntropyLossIgnoreIndices(axis=loss_axis, ignore_indices=ignore_indices, from_logits=fromLogits, sparse_label=sparseLabel, batch_axis=batch_axis)
        elif loss == 'sigmoid_binary_cross_entropy':
            loss_function = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()
        elif loss == 'cross_entropy':
            loss_function = CrossEntropyLoss(axis=loss_axis, sparse_label=sparseLabel, batch_axis=batch_axis)
        elif loss == 'dice_loss':
            loss_weight = loss_params['loss_weight'] if 'loss_weight' in loss_params else None
            loss_function = DiceLoss(axis=loss_axis, weight=loss_weight, sparse_label=sparseLabel, batch_axis=batch_axis)
        elif loss == 'softmax_cross_entropy_ignore_label':
            loss_weight = loss_params['loss_weight'] if 'loss_weight' in loss_params else None
            loss_ignore_label = loss_params['loss_ignore_label'] if 'loss_ignore_label' in loss_params else None
            loss_function = SoftmaxCrossEntropyLossIgnoreLabel(axis=loss_axis, ignore_label=loss_ignore_label, weight=loss_weight, batch_axis=batch_axis)
        elif loss == 'l2':
            loss_function = mx.gluon.loss.L2Loss()
        elif loss == 'l1':
            loss_function = mx.gluon.loss.L1Loss()
        elif loss == 'huber':
            rho = loss_params['rho'] if 'rho' in loss_params else 1
            loss_function = mx.gluon.loss.HuberLoss(rho=rho)
        elif loss == 'hinge':
            loss_function = mx.gluon.loss.HingeLoss(margin=margin)
        elif loss == 'squared_hinge':
            loss_function = mx.gluon.loss.SquaredHingeLoss(margin=margin)
        elif loss == 'logistic':
            labelFormat = loss_params['label_format'] if 'label_format' in loss_params else 'signed'
            loss_function = mx.gluon.loss.LogisticLoss(label_format=labelFormat)
        elif loss == 'kullback_leibler':
            fromLogits = loss_params['from_logits'] if 'from_logits' in loss_params else True
            loss_function = mx.gluon.loss.KLDivLoss(from_logits=fromLogits)
        elif loss == 'log_cosh':
            loss_function = LogCoshLoss()
        else:
            logging.error("Invalid loss parameter.")

        loss_function.hybridize()


        tic = None

        avg_speed = 0
        n = 0
    
        for epoch in range(begin_epoch, begin_epoch + num_epoch):
            if shuffle_data:
                if preprocessing:
                    preproc_lib = "CNNPreprocessor_ResNeXt50_executor"
                    train_iter, test_iter, data_mean, data_std, train_images, test_images = self._data_loader.load_preprocessed_data(batch_size, preproc_lib, shuffle_data)
                else:
                    train_iter, test_iter, data_mean, data_std, train_images, test_images = self._data_loader.load_data(batch_size, shuffle_data)

            global_loss_train = 0.0
            train_batches = 0

            loss_total = 0
            train_iter.reset()
            for batch_i, batch in enumerate(train_iter):
                
                                 
                with autograd.record():
                    labels = [gluon.utils.split_and_load(batch.label[i], ctx_list=mx_context, even_split=False) for i in range(1)]
                    data_ = gluon.utils.split_and_load(batch.data[0], ctx_list=mx_context, even_split=False)

                    predictions_ = [mx.nd.zeros((single_pu_batch_size, 1000,), ctx=context) for context in mx_context]


                    nd.waitall()
                    lossList = []
                    for i in range(num_pus):
                        lossList.append([])

                    net_ret = [self._networks[0](data_[i]) for i in range(num_pus)]
                    predictions_ = [net_ret[i][0][0] for i in range(num_pus)]
                    [lossList[i].append(loss_function(predictions_[i], labels[0][i])) for i in range(num_pus)]


                    losses = [0]*num_pus
                    for i in range(num_pus):
                        for element in lossList[i]:
                            losses[i] = losses[i] + element

                for loss in losses: 
                    loss.backward()
                    loss_total += loss.sum().asscalar()
                    global_loss_train += loss.sum().asscalar()

                train_batches += 1

                if clip_global_grad_norm:
                    grads = []

                    for network in self._networks.values():
                        grads.extend([param.grad(mx_context) for param in network.collect_params().values()])

                    gluon.utils.clip_global_norm(grads, clip_global_grad_norm)

                for trainer in trainers:
                    trainer.step(batch_size)
    
                if tic is None:
                    tic = time.time()
                else:
                    if batch_i % log_period == 0:
                        try:
                            speed = log_period * batch_size / (time.time() - tic)
                        except ZeroDivisionError:
                            speed = float("inf")

                        loss_avg = loss_total / (batch_size * log_period)
                        loss_total = 0

                        logging.info("Epoch[%d] Batch[%d] Speed: %.2f samples/sec Loss: %.5f" % (epoch, batch_i, speed, loss_avg))
                        
                        avg_speed += speed
                        n += 1
    
                        tic = time.time()

            global_loss_train /= (train_batches * batch_size)

            tic = None
    
    
            if eval_train:
                train_iter.batch_size = single_pu_batch_size
                train_iter.reset()
                metric = mx.metric.create(eval_metric, **eval_metric_params)
                for batch_i, batch in enumerate(train_iter):

                    labels = [batch.label[i].as_in_context(mx_context[0]) for i in range(1)]
                    data_ = batch.data[0].as_in_context(mx_context[0])

                    predictions_ = mx.nd.zeros((single_pu_batch_size, 1000,), ctx=mx_context[0])


                    nd.waitall()

                    lossList = []
                    outputs = []
                    attentionList = []

                    net_ret = self._networks[0](data_)
                    predictions_ = net_ret[0][0]
                    outputs.append(predictions_)
                    lossList.append(loss_function(predictions_, labels[0]))
    
                    if save_attention_image == "True":
                        import matplotlib
                        matplotlib.use('Agg')
                        import matplotlib.pyplot as plt
                        logging.getLogger('matplotlib').setLevel(logging.ERROR)

                        if(os.path.isfile('src/test/resources/training_data/Show_attend_tell/dict.pkl')):
                            with open('src/test/resources/training_data/Show_attend_tell/dict.pkl', 'rb') as f:
                                dict = pickle.load(f)

                        plt.clf()
                        fig = plt.figure(figsize=(15,15))
                        max_length = len(labels)-1

                        ax = fig.add_subplot(max_length//3, max_length//4, 1)
                        ax.imshow(train_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))

                        for l in range(max_length):
                            attention = attentionList[l]
                            attention = mx.nd.slice_axis(attention, axis=0, begin=0, end=1).squeeze()
                            attention_resized = np.resize(attention.asnumpy(), (8, 8))
                            ax = fig.add_subplot(max_length//3, max_length//4, l+2)
                            if int(labels[l+1][0].asscalar()) > len(dict):
                                ax.set_title("<unk>")
                            elif dict[int(labels[l+1][0].asscalar())] == "<end>":
                                ax.set_title(".")
                                img = ax.imshow(train_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))
                                ax.imshow(attention_resized, cmap='gray', alpha=0.6, extent=img.get_extent())
                                break
                            else:
                                ax.set_title(dict[int(labels[l+1][0].asscalar())])
                            img = ax.imshow(train_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))
                            ax.imshow(attention_resized, cmap='gray', alpha=0.6, extent=img.get_extent())

                        plt.tight_layout()
                        target_dir = 'target/attention_images'
                        if not os.path.exists(target_dir):
                            os.makedirs(target_dir)
                        plt.savefig(target_dir + '/attention_train.png')
                        plt.close()

                    predictions = []
                    for output_name in outputs:
                        if mx.nd.shape_array(mx.nd.squeeze(output_name)).size > 1:
                            predictions.append(mx.nd.argmax(output_name, axis=1))
                        else:
                            predictions.append(output_name)

                    metric.update(preds=predictions, labels=[labels[j] for j in range(len(labels))])

                train_metric_score = metric.get()[1]
            else:
                train_metric_score = 0

            global_loss_test = 0.0
            test_batches = 0
    
            test_iter.batch_size = single_pu_batch_size
            test_iter.reset()
            metric = mx.metric.create(eval_metric, **eval_metric_params)
            for batch_i, batch in enumerate(test_iter):
                if True: 
                                                   
                    labels = [batch.label[i].as_in_context(mx_context[0]) for i in range(1)]
                    data_ = batch.data[0].as_in_context(mx_context[0])

                    predictions_ = mx.nd.zeros((single_pu_batch_size, 1000,), ctx=mx_context[0])


                    nd.waitall()

                    lossList = []
                    outputs = []
                    attentionList = []

                    net_ret = self._networks[0](data_)
                    predictions_ = net_ret[0][0]
                    outputs.append(predictions_)
                    lossList.append(loss_function(predictions_, labels[0]))

                    if save_attention_image == "True":
                        if not eval_train:
                            import matplotlib
                            matplotlib.use('Agg')
                            import matplotlib.pyplot as plt
                            logging.getLogger('matplotlib').setLevel(logging.ERROR)

                            if(os.path.isfile('src/test/resources/training_data/Show_attend_tell/dict.pkl')):
                                with open('src/test/resources/training_data/Show_attend_tell/dict.pkl', 'rb') as f:
                                    dict = pickle.load(f)

                        plt.clf()
                        fig = plt.figure(figsize=(15,15))
                        max_length = len(labels)-1

                        ax = fig.add_subplot(max_length//3, max_length//4, 1)
                        ax.imshow(test_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))

                        for l in range(max_length):
                            attention = attentionList[l]
                            attention = mx.nd.slice_axis(attention, axis=0, begin=0, end=1).squeeze()
                            attention_resized = np.resize(attention.asnumpy(), (8, 8))
                            ax = fig.add_subplot(max_length//3, max_length//4, l+2)
                            if int(mx.nd.slice_axis(outputs[l+1], axis=0, begin=0, end=1).squeeze().asscalar()) > len(dict):
                                ax.set_title("<unk>")
                            elif dict[int(mx.nd.slice_axis(outputs[l+1], axis=0, begin=0, end=1).squeeze().asscalar())] == "<end>":
                                ax.set_title(".")
                                img = ax.imshow(test_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))
                                ax.imshow(attention_resized, cmap='gray', alpha=0.6, extent=img.get_extent())
                                break
                            else:
                                ax.set_title(dict[int(mx.nd.slice_axis(outputs[l+1], axis=0, begin=0, end=1).squeeze().asscalar())])
                            img = ax.imshow(test_images[0+single_pu_batch_size*(batch_i)].transpose(1,2,0))
                            ax.imshow(attention_resized, cmap='gray', alpha=0.6, extent=img.get_extent())

                        plt.tight_layout()
                        target_dir = 'target/attention_images'
                        if not os.path.exists(target_dir):
                            os.makedirs(target_dir)
                        plt.savefig(target_dir + '/attention_test.png')
                        plt.close()

                loss = 0
                for element in lossList:
                    loss = loss + element

                global_loss_test += loss.sum().asscalar()

                test_batches += 1

                predictions = []
                for output_name in outputs:
                    if mx.nd.shape_array(mx.nd.squeeze(output_name)).size > 1:
                        predictions.append(mx.nd.argmax(output_name, axis=1))
                    else:
                        predictions.append(output_name)

                metric.update(preds=predictions, labels=[labels[j] for j in range(len(labels))])

            global_loss_test /= (test_batches * single_pu_batch_size)
            test_metric_name = metric.get()[0]
            test_metric_score = metric.get()[1]

            metric_file = open(self._net_creator._model_dir_ + 'metric.txt', 'w')
            metric_file.write(test_metric_name + " " + str(test_metric_score))
            metric_file.close()

            logging.info("Epoch[%d] Train metric: %f, Test metric: %f, Train loss: %f, Test loss: %f" % (epoch, train_metric_score, test_metric_score, global_loss_train, global_loss_test))

            if (epoch+1) % checkpoint_period == 0:
                for i, network in self._networks.items():
                    network.save_parameters(self.parameter_path(i) + '-' + str(epoch).zfill(4) + '.params')

        for i, network in self._networks.items():
            network.save_parameters(self.parameter_path(i) + '-' + str((num_epoch-1) + begin_epoch).zfill(4) + '.params')
            network.export(self.parameter_path(i) + '_newest', epoch=0)

            if onnx_export:
                from mxnet.contrib import onnx as onnx_mxnet
                input_shapes = [(1,) + d.shape[1:] for _, d in test_iter.data]
                model_path = self.parameter_path(i) + '_newest'
                onnx_mxnet.export_model(model_path+'-symbol.json', model_path+'-0000.params', input_shapes, np.float32, model_path+'.onnx')

            loss_function.export(self.parameter_path(i) + '_newest_loss', epoch=0)
Example #21
def train(cfg,
          ctx_lst,
          project_name,
          log_interval=5,
          no_val=False,
          lr=None,
          wd=None):
    wandb.init(job_type='train',
               dir=my_tools.root_dir(),
               config=cfg,
               project=project_name)
    if lr and wd:
        wandb.config.lr = lr
        wandb.config.wd = wd

    ctx = my_tools.get_contexts(ctx_lst)
    wandb.config.ctx = ctx

    data_factory = DataFactory(wandb.config.data_name)
    model_factory = ModelFactory(wandb.config.model_name)

    norm_layer, norm_kwargs = my_tools.get_norm_layer(wandb.config.norm,
                                                      len(ctx))
    model_kwargs = {
        'nclass': data_factory.num_class,
        'backbone': wandb.config.backbone,
        'pretrained_base': wandb.config.backbone_init.get('manner') == 'cls',
        'aux': wandb.config.aux,
        'crop_size': wandb.config.crop_size,
        'base_size': wandb.config.base_size,
        'dilate': wandb.config.dilate,
        'norm_layer': norm_layer,
        'norm_kwargs': norm_kwargs,
    }
    net = model_factory.get_model(
        model_kwargs,
        resume=wandb.config.resume,
        lr_mult=wandb.config.lr_mult,
        backbone_init_manner=wandb.config.backbone_init.get('manner'),
        backbone_ckpt=wandb.config.backbone_init.get('backbone_ckpt'),
        prior_classes=wandb.config.backbone_init.get('prior_classes'),
        ctx=ctx)
    if net.symbolize:
        net.hybridize()

    num_worker = 0 if platform.system() == 'Windows' else 16
    train_set = data_factory.seg_dataset(
        split='train',  # sometimes would be 'trainval'
        mode='train',
        transform=my_tools.image_transform(),
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    train_iter = DataLoader(train_set,
                            wandb.config.bs_train,
                            shuffle=True,
                            last_batch='discard',
                            num_workers=num_worker)
    val_set = data_factory.seg_dataset(split='val',
                                       mode='val',
                                       transform=my_tools.image_transform(),
                                       base_size=wandb.config.base_size,
                                       crop_size=wandb.config.crop_size)
    val_iter = DataLoader(val_set,
                          wandb.config.bs_val,
                          shuffle=False,
                          last_batch='keep',
                          num_workers=num_worker)
    wandb.config.num_train = len(train_set)
    wandb.config.num_valid = len(val_set)

    criterion = _get_criterion(wandb.config.aux, wandb.config.aux_weight)
    criterion.initialize(ctx=ctx)
    wandb.config.criterion = type(criterion)

    if wandb.config.optimizer == 'adam':
        trainer = Trainer(net.collect_params(),
                          'adam',
                          optimizer_params={
                              'learning_rate': wandb.config.lr,
                              'wd': wandb.config.wd,
                              'beta1': wandb.config.adam.get('adam_beta1'),
                              'beta2': wandb.config.adam.get('adam_beta2')
                          })
    elif wandb.config.optimizer in ('sgd', 'nag'):
        scheduler = _lr_scheduler(
            mode=wandb.config.lr_scheduler,
            base_lr=wandb.config.lr,
            target_lr=wandb.config.target_lr,
            nepochs=wandb.config.epochs,
            iters_per_epoch=len(train_iter),
            step_epoch=wandb.config.step.get('step_epoch'),
            step_factor=wandb.config.step.get('step_factor'),
            power=wandb.config.poly.get('power'))
        trainer = Trainer(net.collect_params(),
                          wandb.config.optimizer,
                          optimizer_params={
                              'lr_scheduler': scheduler,
                              'wd': wandb.config.wd,
                              'momentum': wandb.config.momentum,
                              'multi_precision': True
                          })
    else:
        raise RuntimeError(f"Unknown optimizer: {wandb.config.optimizer}")

    metric = SegmentationMetric(data_factory.num_class)

    logger = get_logger(name='train', level=10)
    t_start = my_tools.get_strftime()
    logger.info(f'Training start: {t_start}')
    for k, v in wandb.config.items():
        logger.info(f'{k}: {v}')
    logger.info('-----> end hyper-parameters <-----')
    wandb.config.start_time = t_start

    best_score = .0
    best_epoch = 0
    for epoch in range(wandb.config.epochs):
        train_loss = .0
        tbar = tqdm(train_iter)
        for i, (data, target) in enumerate(tbar):
            gpu_datas = split_and_load(data, ctx_list=ctx)
            gpu_targets = split_and_load(target, ctx_list=ctx)
            with autograd.record():
                loss_gpus = [
                    criterion(*net(gpu_data), gpu_target)
                    for gpu_data, gpu_target in zip(gpu_datas, gpu_targets)
                ]
            for loss in loss_gpus:
                autograd.backward(loss)
            trainer.step(wandb.config.bs_train)
            nd.waitall()
            train_loss += sum([loss.mean().asscalar()
                               for loss in loss_gpus]) / len(loss_gpus)
            tbar.set_description(
                'Epoch-%d [training], loss %.5f, %s' %
                (epoch, train_loss /
                 (i + 1), my_tools.get_strftime('%Y-%m-%d %H:%M:%S')))
            if (i % log_interval == 0) or (i + 1 == len(train_iter)):
                wandb.log({
                    f'train_loss_batch, interval={log_interval}':
                    train_loss / (i + 1)
                })

        wandb.log({
            'train_loss_epoch': train_loss / (len(train_iter)),
            'custom_step': epoch
        })

        if not no_val:
            val_loss = .0
            vbar = tqdm(val_iter)
            for i, (data, target) in enumerate(vbar):
                gpu_datas = split_and_load(data=data,
                                           ctx_list=ctx,
                                           even_split=False)
                gpu_targets = split_and_load(data=target,
                                             ctx_list=ctx,
                                             even_split=False)
                loss_gpus = []
                for gpu_data, gpu_target in zip(gpu_datas, gpu_targets):
                    gpu_output = net(gpu_data)
                    loss_gpus.append(criterion(*gpu_output, gpu_target))
                    metric.update(gpu_target, gpu_output[0])
                val_loss += sum([loss.mean().asscalar()
                                 for loss in loss_gpus]) / len(loss_gpus)
                vbar.set_description(
                    'Epoch-%d [validation], PA %.4f, mIoU %.4f' %
                    (epoch, metric.get()[0], metric.get()[1]))
                nd.waitall()
            pix_acc, mean_iou = metric.get()
            wandb.log({
                'val_PA': pix_acc,
                'val_mIoU': mean_iou,
                'val_loss': val_loss / len(val_iter),
                'custom_step': epoch
            })
            metric.reset()
            if mean_iou > best_score:
                my_tools.save_checkpoint(
                    model=net,
                    model_name=wandb.config.model_name.lower(),
                    backbone=wandb.config.backbone.lower(),
                    data_name=wandb.config.data_name.lower(),
                    time_stamp=wandb.config.start_time,
                    is_best=True)
                best_score = mean_iou
                best_epoch = epoch

    logger.info(
        f'Best val mIoU={round(best_score * 100, 2)} at epoch: {best_epoch}')
    wandb.config.best_epoch = best_epoch
    my_tools.save_checkpoint(model=net,
                             model_name=wandb.config.model_name.lower(),
                             backbone=wandb.config.backbone.lower(),
                             data_name=wandb.config.data_name.lower(),
                             time_stamp=wandb.config.start_time,
                             is_best=False)
Example #22

# 8.3.1 - Parallel computation on CPU and GPU
def run(x):
    return [nd.dot(x, x) for _ in range(10)]


# Create NDArrays in CPU memory and in GPU memory respectively
x_cpu = nd.random.uniform(shape=(2000, 2000))
x_gpu = nd.random.uniform(shape=(6000, 6000), ctx=mx.gpu(0))
# Warm up both devices so lazy initialization does not skew the timings below
run(x_cpu)  # warm-up starts
run(x_gpu)
nd.waitall()  # warm-up ends

with d2l.Benchmark('Run on CPU.'):
    run(x_cpu)
    nd.waitall()

with d2l.Benchmark('Then run on GPU.'):
    run(x_gpu)
    nd.waitall()

# Automatically parallelize different tasks across devices
with d2l.Benchmark('Run on both CPU and GPU in parallel.'):
    run(x_cpu)
    run(x_gpu)
    nd.waitall()
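d2l.Benchmark above is essentially a context-manager stopwatch. If the d2l package is not available, a minimal stand-in could look like this (the class name and message format are assumptions, not the library's exact code):

import time

class Benchmark:
    """Context-manager timer, a rough stand-in for d2l.Benchmark."""
    def __init__(self, prefix=''):
        self.prefix = prefix

    def __enter__(self):
        self.start = time.time()

    def __exit__(self, *args):
        print('%s time: %.4f sec' % (self.prefix, time.time() - self.start))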
Example #23
def train_net(net, train_iter, valid_iter, batch_size, trainer, ctx,
              num_epochs, lr_sch, save_prefix):
    logger.info("===================START TRAINING====================")
    if use_mxboard:
        sw = SummaryWriter(logdir='logs', flush_secs=5)
    cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
    cls_acc = mx.metric.Accuracy(name="train acc")
    top_acc = 0
    iter_num = 0
    #test_acc,test_loss = test_net(net, valid_iter, ctx)
    #sw.add_graph(net) #only hybrid block supported
    param_names = net.collect_params().keys()
    for epoch in range(num_epochs):
        train_loss = []
        t0 = time.time()
        if isinstance(train_iter, mx.io.MXDataIter):
            train_iter.reset()
        total = 0
        trainer.set_learning_rate(lr_sch(epoch))
        for batch in train_iter:
            iter_num += 1
            # print("iter ",iter_num," start")
            if isinstance(batch, mx.io.DataBatch):
                X, Y = batch.data[0], batch.label[0]
                #total += X.shape[0]
                #print(total)
            else:
                X, Y = batch
            #print(X.shape,Y.shape)
            #print(Y)
            X = X.as_in_context(ctx)
            Y = Y.as_in_context(ctx)
            with autograd.record(True):
                out = net(X)
                #out = out.as_in_context(mx.cpu())
                loss = cls_loss(out, Y)
            # print(out.asnumpy()[0])
            # print('loss = ', loss.sum().asscalar())
            loss.backward()
            train_loss.append(loss.sum().asscalar())
            trainer.step(batch_size)
            cls_acc.update(Y, out)
            nd.waitall()
            #print("iter ",iter_num," end")
            if use_mxboard:
                if iter_num % 100 == 0:
                    sw.add_scalar(tag='train_loss',
                                  value=loss.mean().asscalar(),
                                  global_step=iter_num)
                    sw.add_scalar(tag='train_acc',
                                  value=cls_acc.get()[1],  # Accuracy.get() returns (name, value)
                                  global_step=iter_num)
                if iter_num % 100 == 0:
                    for name in net.collect_params():
                        param = net.collect_params()[name]
                        if param.grad_req != "null":
                            sw.add_histogram(tag=name,
                                             values=param.grad(),
                                             global_step=iter_num,
                                             bins=1000)

        logger.info("epoch {} lr {} {}sec".format(epoch, trainer.learning_rate,
                                                  time.time() - t0))
        train_loss, train_acc = np.mean(train_loss) / batch_size, cls_acc.get()
        logger.info("\ttrain loss {} {}".format(train_loss, train_acc))
        if epoch > 0 and (epoch % 10) == 0:
            test_acc, test_loss = test_net(net, valid_iter, ctx)
            if use_mxboard:
                sw.add_scalar(tag='test_acc',
                              value=test_acc,
                              global_step=epoch)
                sw.add_scalar(tag='test_loss',
                              value=test_loss,
                              global_step=epoch)
            if top_acc < test_acc:
                top_acc = test_acc
                logger.info('\ttop valid acc {}'.format(test_acc))
                if isinstance(net, mx.gluon.nn.HybridSequential) or isinstance(
                        net, mx.gluon.nn.HybridBlock):
                    pf = '{}_{:.3f}.params'.format(save_prefix, top_acc)
                    net.export(pf, epoch)
                else:
                    net_path = '{}top_acc_{}_{:.3f}.params'.format(
                        save_prefix, epoch, top_acc)
                    net.save_parameters(net_path)

    if use_mxboard:
        sw.close()
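
The loop above sets the learning rate per epoch via trainer.set_learning_rate(lr_sch(epoch)), but lr_sch itself is not part of this snippet. A minimal sketch of such a schedule, assuming a plain step decay (the base rate, step and factor below are illustrative, not taken from the original code):

# Hypothetical epoch-based step schedule; base_lr/step/factor are placeholders.
def make_lr_sch(base_lr=0.1, step=30, factor=0.1):
    def lr_sch(epoch):
        return base_lr * (factor ** (epoch // step))
    return lr_sch

lr_sch = make_lr_sch()
# lr_sch(0) -> 0.1, lr_sch(30) -> 0.01, lr_sch(60) -> 0.001
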
def test_copy():
    a = nd.ones((SMALL_Y, LARGE_X))
    b = a.copy()
    nd.waitall()
    assert b.shape == a.shape
    assert b.size == LARGE_SIZE
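
test_copy (and test_expand_dims further below) depend on module-level constants that are not shown here. A plausible setup, assuming values in the spirit of MXNet's large-tensor tests:

# Assumed constants (not in the snippet); any consistent pair works as long
# as LARGE_SIZE equals the product of the two dimensions.
LARGE_X = 100000000
SMALL_Y = 50
LARGE_SIZE = LARGE_X * SMALL_Y
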
Ejemplo n.º 25
0
def train(train_data,
          net,
          loss,
          ctx,
          global_step,
          epoch_step,
          num_epochs,
          best_F1=0):
    print("Start training on ", ctx)

    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    for epoch in range(num_epochs):
        if epoch < 50:
            trainer = gluon.Trainer(net.collect_params(), 'adam', {
                'learning_rate': 0.001,
                'wd': 1e-3
            })
        elif epoch < 90:
            trainer = gluon.Trainer(net.collect_params(), 'adam', {
                'learning_rate': 0.0001,
                'wd': 1e-3
            })
        elif epoch < 120:
            trainer = gluon.Trainer(net.collect_params(), 'adam', {
                'learning_rate': 0.00001,
                'wd': 1e-3
            })
        else:
            trainer = gluon.Trainer(net.collect_params(), 'sgd', {
                'learning_rate': 0.000001,
                'momentum': 0.9,
                'wd': 1e-3
            })
        train_loss, n = 0.0, 0.0
        TP, TN, FP, FN = 0, 0, 0, 0
        start = time()
        for i, batch in enumerate(train_data):
            data, label, batch_size = get_batch(batch, ctx)
            losses = []
            with autograd.record():
                outputs = [net(X) for X in data]
                losses = [loss(yhat, y) for yhat, y in zip(outputs, label)]

            for l in losses:
                l.backward()
            # log the loss of the last device slice for this iteration
            sw.add_scalar(tag='cross_entropy',
                          value=losses[-1].mean().asscalar(),
                          global_step=global_step)
            global_step += 1

            train_loss += sum([l.sum().asscalar() for l in losses])
            n += batch_size

            trainer.step(batch_size)
        for data, label in test_data:
            data = data.as_in_context(ctx[0])
            label = label.as_in_context(ctx[0])
            pred = net(data)
            nd.waitall()
            pred = nd.sigmoid(pred)
            pred = (pred > 0.5).reshape(-1, 256, 256)

            TPt = nd.sum(pred * label).asscalar()
            FPt = nd.sum(pred - (pred * label)).asscalar()
            FNt = nd.sum(label - (pred * label)).asscalar()
            TNt = nd.sum((1 - pred) * (1 - label)).asscalar()

            TP = TP + TPt
            FP = FP + FPt
            FN = FN + FNt
            TN = TN + TNt

        ACC = (TP + TN) / (TP + TN + FP + FN + 1e-15)
        TPR = TP / (TP + FN + 1e-15)
        TNR = TN / (FP + TN + 1e-15)
        PPV = TP / (TP + FP + 1e-15)
        F1 = 2 * PPV * TPR / (PPV + TPR + 1e-15)

        sw.add_scalar(tag='test_acc', value=ACC, global_step=epoch_step)
        sw.add_scalar(tag='test_TPR', value=TPR, global_step=epoch_step)
        sw.add_scalar(tag='test_TNR', value=TNR, global_step=epoch_step)
        sw.add_scalar(tag='test_PPV', value=PPV, global_step=epoch_step)
        sw.add_scalar(tag='F1', value=F1, global_step=epoch_step)
        epoch_step += 1
        print('EPOCH', epoch)
        print('test_acc=', ACC)
        print('test_TPR=', TPR)
        print('test_TNR=', TNR)
        print('test_PPV=', PPV)
        print('F1=', F1)

        if F1 > best_F1:
            net.save_parameters('u_e1.params')
            best_F1 = F1
        if epoch == 0:
            sw.add_graph(net)

        print('train_loss=', train_loss / n)
        print('time:', time() - start)
    sw.close()
    net.export("mynet", epoch)
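
The loop above relies on a get_batch(batch, ctx) helper that is not shown; it must return per-device data and label lists plus the batch size, since the outputs are built as [net(X) for X in data]. A minimal sketch, assuming the usual split_and_load pattern:

# Hypothetical get_batch helper: split one batch across the given contexts.
from mxnet import gluon

def get_batch(batch, ctx):
    data, label = batch
    return (gluon.utils.split_and_load(data, ctx),
            gluon.utils.split_and_load(label, ctx),
            data.shape[0])
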
Ejemplo n.º 26
0
    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if config.train_cfg.param_init:
            init_func = getattr(mx.init, config.train_cfg.init)
            net.initialize(init_func(), ctx=ctx, force_reinit=True)
        else:
            net.load_parameters(config.train_cfg.param_file, ctx=ctx)

        summary(net, stat_name, nd.uniform(
            shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
        # net = nn.HybridBlock()
        net.hybridize()

        root = config.dir_cfg.dataset
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer_arg = {'learning_rate': config.lr_cfg.lr,
                       'wd': config.lr_cfg.wd, 'lr_scheduler': lr_sch}
        extra_arg = eval(config.lr_cfg.extra_arg)
        trainer_arg.update(extra_arg)
        trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
        if config.train_cfg.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if config.data_cfg.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0

        best_val_score = 0

        # print('start training')
        sig_state.emit(1)
        sig_pgbar.emit(0)
        # signal.emit('Training')
        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1
            for i, batch in enumerate(train_data):
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    profiler.set_state('run')
                    is_profiler_run = True
                if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                    sw.add_graph(net)
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not config.data_cfg.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not config.data_cfg.mixup:
                    data = data_1
                    label = label_1
                else:
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                    # with AMP the scaled backward pass stays inside the record scope
                    if config.train_cfg.amp:
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                if not config.train_cfg.amp:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()
                if config.save_cfg.tensorboard:
                    sw.add_scalar(tag='lr', value=trainer.learning_rate,
                                  global_step=iteration)
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    nd.waitall()
                    profiler.set_state('stop')
                    profiler.dump()
                iteration += 1
                sig_pgbar.emit(iteration)
                if check_flag()[0]:
                    sig_state.emit(2)
                while check_flag()[0] or check_flag()[1]:
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')

            epoch_time = time.time() - tic
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, val_loss = test(ctx, val_data)
            # if config.data_cfg.mixup:
            #     train_history.update([acc, 1-val_acc])
            #     plt.cla()
            #     train_history.plot(save_path='%s/%s_history.png' %
            #                        (plot_name, model_name))
            # else:
            train_history.update([1-train_acc, 1-val_acc])
            plt.cla()
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate

            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, epoch_time))
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            if config.save_cfg.tensorboard:
                sw._add_scalars(tag='Acc',
                                scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
                sw._add_scalars(tag='Loss',
                                scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)

            sig_table.emit([epoch, train_loss, train_acc,
                            val_loss, val_acc, current_lr, epoch_time])
            csv_writer.writerow([epoch, train_loss, train_acc,
                                 val_loss, val_acc, current_lr, epoch_time])
            csv_file.flush()

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))
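
The mixup branch above calls label_transform(Y, classes) to turn integer labels into dense vectors so two labels can be blended with weight lam. A minimal one-hot sketch of such a helper (the real one may also apply label smoothing):

# Minimal one-hot label_transform assumed by the mixup branch above.
from mxnet import nd

def label_transform(label, classes):
    ind = label.astype('int')
    res = nd.zeros((ind.shape[0], classes), ctx=label.context)
    res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
    return res
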
Ejemplo n.º 27
0
def benchmark(net, X):
    start = time.time()
    for i in range(1000):
        _ = net(X)
    nd.waitall()
    return time.time() - start
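
benchmark runs 1000 forward passes and calls nd.waitall() before reading the clock, since MXNet executes operators asynchronously and the Python loop alone would return long before the computation finishes. A hedged usage sketch with a throwaway network (layer sizes and input shape are illustrative):

# Illustrative use of benchmark(): compare imperative vs. hybridized execution.
from mxnet import nd
from mxnet.gluon import nn

net = nn.HybridSequential()
net.add(nn.Dense(256, activation='relu'), nn.Dense(10))
net.initialize()
x = nd.random.normal(shape=(64, 512))

print('imperative: %.3f sec' % benchmark(net, x))
net.hybridize()
print('hybridized: %.3f sec' % benchmark(net, x))
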
Ejemplo n.º 28
0
def _engine_cond(cond_type='scaffold',
                 file_name='datasets/ChEMBL_scaffold.txt',
                 num_scaffolds=734,
                 is_full=False,
                 ckpt_dir='ckpt/scaffold',
                 num_folds=5,
                 fold_id=0,
                 batch_size=50,
                 batch_size_test=100,
                 num_workers=2,
                 k=5,
                 p=0.8,
                 F_e=16,
                 F_h=(32, 64, 128, 128, 256, 256),
                 F_skip=256,
                 F_c=(512, ),
                 Fh_policy=128,
                 activation='relu',
                 N_rnn=3,
                 gpu_ids=(0, 1, 2, 3),
                 lr=1e-3,
                 decay=0.015,
                 decay_step=100,
                 clip_grad=3.0,
                 iterations=30000,
                 summary_step=200):
    if all([
            os.path.isfile(os.path.join(ckpt_dir, _n))
            for _n in ['log.out', 'ckpt.params', 'trainer.status']
    ]):
        is_continuous = True
    else:
        is_continuous = False

    if is_full:
        if cond_type != 'kinase':
            if cond_type == 'scaffold':
                cond = data.SparseFP(num_scaffolds)
                N_C = num_scaffolds
            elif cond_type == 'prop':
                cond = data.Delimited()
                N_C = 2
            else:
                raise ValueError

            with open(file_name) as f:
                dataset = data.Lambda(f.readlines(),
                                      lambda _x: _x.strip('\n').strip('\r'))

            # get sampler and loader for training set
            sampler_train = data.BalancedSampler(
                cost=[len(l.split('\t')[0]) for l in dataset],
                batch_size=batch_size)
            loader_train = data.CMolRNNLoader(dataset,
                                              batch_sampler=sampler_train,
                                              num_workers=num_workers,
                                              k=k,
                                              p=p,
                                              conditional=cond)

            loader_test = []
        else:
            cond = data.Delimited()
            N_C = 2

            if all([
                    os.path.isfile(os.path.join(ckpt_dir, _n))
                    for _n in ['log.out', 'ckpt.params', 'trainer.status']
            ]):
                is_continuous = True
            else:
                is_continuous = False

            with open(file_name) as f:
                dataset = data.Lambda(f.readlines(),
                                      lambda _x: _x.strip('\n').strip('\r'))

            # get dataset
            def _filter(_line, _i):
                return int(_line.split('\t')[-1]) == _i

            db_train = data.Lambda(data.Filter(
                dataset, fn=lambda _x: not _filter(_x, fold_id)),
                                   fn=lambda _x: _x[:-2])
            db_test = data.Lambda(data.Filter(
                dataset, fn=lambda _x: _filter(_x, fold_id)),
                                  fn=lambda _x: _x[:-2])

            # get sampler and loader for test set
            loader_test = data.CMolRNNLoader(db_test,
                                             shuffle=True,
                                             num_workers=num_workers,
                                             k=k,
                                             p=p,
                                             conditional=cond,
                                             batch_size=batch_size_test)

            # get sampler and loader for training set
            loader_train = data.CMolRNNLoader(db_train,
                                              shuffle=True,
                                              num_workers=num_workers,
                                              k=k,
                                              p=p,
                                              conditional=cond,
                                              batch_size=batch_size)

        # get iterator
        it_train, it_test = iter(loader_train), iter(loader_test)
    else:
        if cond_type != 'kinase':
            if cond_type == 'scaffold':
                cond = data.SparseFP(num_scaffolds)
                N_C = num_scaffolds
            elif cond_type == 'prop':
                cond = data.Delimited()
                N_C = 2
            else:
                raise ValueError

            if all([
                    os.path.isfile(os.path.join(ckpt_dir, _n))
                    for _n in ['log.out', 'ckpt.params', 'trainer.status']
            ]):
                is_continuous = True
            else:
                is_continuous = False

            with open(file_name) as f:
                dataset = data.Lambda(f.readlines(),
                                      lambda _x: _x.strip('\n').strip('\r'))

            # get dataset
            db_train = data.KFold(dataset,
                                  k=num_folds,
                                  fold_id=fold_id,
                                  is_train=True)
            db_test = data.KFold(dataset,
                                 k=num_folds,
                                 fold_id=fold_id,
                                 is_train=False)

            # get sampler and loader for training set
            sampler_train = data.BalancedSampler(
                cost=[len(l.split('\t')[0]) for l in db_train],
                batch_size=batch_size)
            loader_train = data.CMolRNNLoader(db_train,
                                              batch_sampler=sampler_train,
                                              num_workers=num_workers,
                                              k=k,
                                              p=p,
                                              conditional=cond)

            # get sampler and loader for test set
            sampler_test = data.BalancedSampler(
                cost=[len(l.split('\t')[0]) for l in db_test],
                batch_size=batch_size_test)
            loader_test = data.CMolRNNLoader(db_test,
                                             batch_sampler=sampler_test,
                                             num_workers=num_workers,
                                             k=k,
                                             p=p,
                                             conditional=cond)

        else:
            cond = data.Delimited()
            N_C = 2

            if all([
                    os.path.isfile(os.path.join(ckpt_dir, _n))
                    for _n in ['log.out', 'ckpt.params', 'trainer.status']
            ]):
                is_continuous = True
            else:
                is_continuous = False

            with open(file_name) as f:
                dataset = data.Lambda(f.readlines(),
                                      lambda _x: _x.strip('\n').strip('\r'))

            # get dataset
            def _filter(_line, _i):
                return int(_line.split('\t')[-1]) == _i

            db_train = data.Lambda(data.Filter(
                dataset, fn=lambda _x: not _filter(_x, fold_id)),
                                   fn=lambda _x: _x[:-2])
            db_test = data.Lambda(data.Filter(
                dataset, fn=lambda _x: _filter(_x, fold_id)),
                                  fn=lambda _x: _x[:-2])

            # get sampler and loader for training set
            loader_train = data.CMolRNNLoader(db_train,
                                              shuffle=True,
                                              num_workers=num_workers,
                                              k=k,
                                              p=p,
                                              conditional=cond,
                                              batch_size=batch_size)

            # get sampler and loader for test set
            loader_test = data.CMolRNNLoader(db_test,
                                             shuffle=True,
                                             num_workers=num_workers,
                                             k=k,
                                             p=p,
                                             conditional=cond,
                                             batch_size=batch_size_test)

        # get iterator
        it_train, it_test = iter(loader_train), iter(loader_test)

    # build model
    if not is_continuous:
        configs = {
            'N_C': N_C,
            'F_e': F_e,
            'F_h': F_h,
            'F_skip': F_skip,
            'F_c': F_c,
            'Fh_policy': Fh_policy,
            'activation': activation,
            'rename': True,
            'N_rnn': N_rnn
        }
        with open(os.path.join(ckpt_dir, 'configs.json'), 'w') as f:
            json.dump(configs, f)
    else:
        with open(os.path.join(ckpt_dir, 'configs.json')) as f:
            configs = json.load(f)

    model = models.CVanillaMolGen_RNN(get_mol_spec().num_atom_types,
                                      get_mol_spec().num_bond_types,
                                      D=2,
                                      **configs)

    ctx = [mx.gpu(i) for i in gpu_ids]
    model.collect_params().initialize(mx.init.Xavier(),
                                      force_reinit=True,
                                      ctx=ctx)
    if not is_continuous:
        if cond_type == 'kinase':
            model.load_params(os.path.join(ckpt_dir, 'ckpt.params.bk'),
                              ctx=ctx,
                              allow_missing=True)
    else:
        model.load_params(os.path.join(ckpt_dir, 'ckpt.params'), ctx=ctx)

    # construct optimizer
    opt = mx.optimizer.Adam(learning_rate=lr, clip_gradient=clip_grad)
    trainer = gluon.Trainer(model.collect_params(), opt)
    if is_continuous:
        trainer.load_states(os.path.join(ckpt_dir, 'trainer.status'))

    if not is_continuous:
        t0 = time.time()
        global_counter = 0
    else:
        with open(os.path.join(ckpt_dir, 'log.out')) as f:
            records = f.readlines()
            if records[-1] != 'Training finished\n':
                final_record = records[-1]
            else:
                final_record = records[-2]
        count, t_final = int(final_record.split('\t')[0]), float(
            final_record.split('\t')[1])
        t0 = time.time() - t_final * 60
        global_counter = count

    with open(os.path.join(ckpt_dir, 'log.out'),
              mode='w' if not is_continuous else 'a') as f:
        if not is_continuous:
            f.write('step\ttime(h)\tloss\tlr\n')
        while True:
            global_counter += 1

            try:
                inputs = [next(it_train) for _ in range(len(gpu_ids))]
            except StopIteration:
                it_train = iter(loader_train)
                inputs = [next(it_train) for _ in range(len(gpu_ids))]

            # move to gpu
            inputs = [
                data.CMolRNNLoader.from_numpy_to_tensor(input_i, j)
                for j, input_i in zip(gpu_ids, inputs)
            ]

            with autograd.record():
                loss = [(model(*input_i)).as_in_context(mx.gpu(gpu_ids[0]))
                        for input_i in inputs]
                loss = sum(loss) / len(gpu_ids)
                loss.backward()

            nd.waitall()
            gc.collect()

            trainer.step(batch_size=1)

            if global_counter % decay_step == 0:
                trainer.set_learning_rate(trainer.learning_rate *
                                          (1.0 - decay))

            if global_counter % summary_step == 0:
                if is_full:
                    loss = np.asscalar((sum(loss) / len(gpu_ids)).asnumpy())
                else:
                    del loss, inputs
                    gc.collect()

                    try:
                        inputs = [next(it_test) for _ in range(len(gpu_ids))]
                    except StopIteration:
                        it_test = iter(loader_test)
                        inputs = [next(it_test) for _ in range(len(gpu_ids))]

                    with autograd.predict_mode():
                        # move to gpu
                        inputs = [
                            data.CMolRNNLoader.from_numpy_to_tensor(
                                input_i, j)
                            for j, input_i in zip(gpu_ids, inputs)
                        ]
                        loss = [
                            (model(*input_i)).as_in_context(mx.gpu(gpu_ids[0]))
                            for input_i in inputs
                        ]
                        loss = np.asscalar(
                            (sum(loss) / len(gpu_ids)).asnumpy())

                model.save_params(os.path.join(ckpt_dir, 'ckpt.params'))
                trainer.save_states(os.path.join(ckpt_dir, 'trainer.status'))

                f.write('{}\t{}\t{}\t{}\n'.format(global_counter,
                                                  float(time.time() - t0) / 60,
                                                  loss, trainer.learning_rate))
                f.flush()

                del loss, inputs
                gc.collect()

            if global_counter >= iterations:
                break

        # save before exit
        model.save_params(os.path.join(ckpt_dir, 'ckpt.params'))
        trainer.save_states(os.path.join(ckpt_dir, 'trainer.status'))

        f.write('Training finished\n')
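
Note the learning-rate handling above: every decay_step iterations the rate is multiplied by (1 - decay), so after n decays it equals lr * (1 - decay)**n. With the default arguments this works out to roughly 1e-5 by the end of training:

# Effective final learning rate under the multiplicative decay above,
# using _engine_cond's default hyper-parameters.
lr, decay, decay_step, iterations = 1e-3, 0.015, 100, 30000
final_lr = lr * (1.0 - decay) ** (iterations // decay_step)
print(final_lr)  # ~1.07e-05
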
Ejemplo n.º 29
0
                                          target.reshape(-1, ))
            L = batch_L / data.size
            hiddens = h
        L.backward()
        grads = [p.grad() for p in net.collect_params().values()]
        gluon.utils.clip_global_norm(grads, grad_clip)
        trainer.step(1)

if mpi_rank == 0:
    params_prev = [
        param.data().copy() for param in net.collect_params().values()
    ]
else:
    params_prev = None

nd.waitall()

# broadcast
params_prev = mpi_comm.bcast(params_prev, root=0)
for param, param_prev in zip(net.collect_params().values(), params_prev):
    param.set_data(param_prev)

if mpi_rank == 0:
    worker_list = list(range(mpi_size))

training_file_index_list = [i for i in range(len(training_files))]

alpha = args.alpha

randperm_choice_list = []
randperm_list = [i for i in range(args.nsplit)]
def test_expand_dims():
    a = nd.array(np.ones((SMALL_Y, LARGE_X)))
    b = nd.expand_dims(a, axis=1)
    nd.waitall()
    assert b.shape == (SMALL_Y, 1, LARGE_X)
def benchmark(net, x):
    start = time.time()
    for i in range(1000):
        _ = net(x)
    nd.waitall()  # wait for all computations to finish so the timing is accurate
    return time.time() - start