Exemple #1
0
def train():
    """Alternate generator and discriminator updates over the training set.

    Reads its settings from ``get_args("train")``, builds two generators and
    two discriminators, restores them through ``load_ckpt`` and then, for
    every batch, runs one generator step followed by one discriminator step.
    The discriminator is fed fakes drawn from two ``ImagePool`` buffers.

    When ``args.need_profiler`` is set, a MindSpore ``Profiler`` is created up
    front, analysed after the first epoch, and training stops there.
    """
    args = get_args("train")
    if args.need_profiler:
        # Imported lazily so the profiler is only required when requested.
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir,
                            is_detail=True,
                            is_show_op_path=True)

    dataset = create_dataset(args)
    gen_a = get_generator(args)
    gen_b = get_generator(args)
    dis_a = get_discriminator(args)
    dis_b = get_discriminator(args)
    load_ckpt(args, gen_a, gen_b, dis_a, dis_b)

    pool_a = ImagePool(args.pool_size)
    pool_b = ImagePool(args.pool_size)
    generator = Generator(gen_a, gen_b, args.lambda_idt > 0)

    dis_loss = DiscriminatorLoss(args, dis_a, dis_b)
    gen_loss = GeneratorLoss(args, generator, dis_a, dis_b)
    gen_optimizer = nn.Adam(generator.trainable_params(), get_lr(args), beta1=args.beta1)
    dis_optimizer = nn.Adam(dis_loss.trainable_params(), get_lr(args), beta1=args.beta1)

    gen_step = TrainOneStepG(gen_loss, generator, gen_optimizer)
    dis_step = TrainOneStepD(dis_loss, dis_optimizer)

    batches = dataset.create_dict_iterator()
    reporter = Reporter(args)
    reporter.info('==========start training===============')
    for _ in range(args.max_epoch):
        reporter.epoch_start()
        for batch in batches:
            real_a, real_b = batch["image_A"], batch["image_B"]
            gen_out = gen_step(real_a, real_b)
            fake_a, fake_b = gen_out[0], gen_out[1]
            # The discriminator sees fakes routed through the image pools.
            pooled_a = pool_a.query(fake_a)
            pooled_b = pool_b.query(fake_b)
            dis_out = dis_step(real_a, real_b, pooled_a, pooled_b)
            reporter.step_end(gen_out, dis_out)
            reporter.visualizer(real_a, real_b, fake_a, fake_b)
        reporter.epoch_end(gen_step)
        if args.need_profiler:
            # Profiling runs cover a single epoch only.
            profiler.analyse()
            break

    reporter.info('==========end training===============')
Exemple #2
0
def eval_net():
    '''Evaluate a TextCNN checkpoint on the test split of the configured dataset.

    All settings come from the module-level ``config`` object: the dataset
    name and path, device target, batch size, network dimensions and the
    checkpoint file to load. The resulting metrics dict is printed.

    Raises:
        ValueError: if ``config.dataset`` is not one of 'MR', 'SUBJ' or 'SST2'.
    '''
    if config.dataset == 'MR':
        instance = MovieReview(root_dir=config.data_path, maxlen=config.word_len, split=0.9)
    elif config.dataset == 'SUBJ':
        instance = Subjectivity(root_dir=config.data_path, maxlen=config.word_len, split=0.9)
    elif config.dataset == 'SST2':
        instance = SST2(root_dir=config.data_path, maxlen=config.word_len, split=0.9)
    else:
        # Previously an unknown name fell through and surfaced later as a
        # NameError on `instance`; fail early with a meaningful message.
        raise ValueError("Unsupported dataset: {!r}; expected 'MR', 'SUBJ' or 'SST2'.".format(config.dataset))

    device_target = config.device_target
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    if device_target == "Ascend":
        context.set_context(device_id=get_device_id())

    dataset = instance.create_test_dataset(batch_size=config.batch_size)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
    net = TextCNN(vocab_len=instance.get_dict_len(), word_len=config.word_len,
                  num_classes=config.num_classes, vec_length=config.vec_length)
    # The optimizer is handed to Model even though only eval() is called here.
    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=0.001,
                  weight_decay=float(config.weight_decay))

    param_dict = load_checkpoint(config.checkpoint_file_path)
    print("load checkpoint from [{}].".format(config.checkpoint_file_path))

    load_param_into_net(net, param_dict)
    net.set_train(False)
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc': Accuracy()})

    acc = model.eval(dataset)
    print("accuracy: ", acc)
Exemple #3
0
def get_train_optimizer(net, steps_per_epoch, args):
    """
    Build the training optimizer selected by ``args.optimizer``.

    Only "Adam" is supported. Trainable parameters whose name contains
    'beta', 'gamma' or 'bias' are placed in a group without an explicit
    weight decay; all others receive ``args.weight_decay``. The learning
    rate is the schedule returned by ``get_lr`` wrapped in a ``Tensor``.

    Raises:
        ValueError: if ``args.optimizer`` is anything other than "Adam".
    """
    if args.optimizer != "Adam":
        raise ValueError("Unsupported optimizer.")

    schedule = Tensor(get_lr(lr_init=1e-4,
                             lr_end=1e-6,
                             lr_max=9e-4,
                             warmup_epochs=args.warmup_epochs,
                             total_epochs=args.epoch_size,
                             steps_per_epoch=steps_per_epoch,
                             lr_decay_mode="linear"))

    exempt_keys = ('beta', 'gamma', 'bias')
    with_decay, without_decay = [], []
    for param in net.trainable_params():
        is_exempt = any(key in param.name for key in exempt_keys)
        (without_decay if is_exempt else with_decay).append(param)

    param_groups = [
        {'params': with_decay, 'weight_decay': args.weight_decay},
        {'params': without_decay},
        # Keeps the optimizer's parameter order equal to the network's.
        {'order_params': net.trainable_params()},
    ]
    return nn.Adam(params=param_groups, learning_rate=schedule)
Exemple #4
0
def test_svi_vae():
    """Train a VAE with SVI for 5 epochs, then sample from and reconstruct with it.

    Smoke test: it prints the trained loss and the shapes of the generated
    and reconstructed samples; it makes no assertions.
    """
    # define the encoder and decoder
    encoder = Encoder()
    decoder = Decoder()
    # define the vae model
    vae = VAE(encoder, decoder, hidden_size=400, latent_size=20)
    # define the loss function
    net_loss = ELBO(latent_prior='Normal', output_prior='Normal')
    # define the optimizer
    optimizer = nn.Adam(params=vae.trainable_params(), learning_rate=0.001)
    # define the training dataset
    ds_train = create_dataset(image_path, 128, 1)
    net_with_loss = nn.WithLossCell(vae, net_loss)
    # define the variational inference
    vi = SVI(net_with_loss=net_with_loss, optimizer=optimizer)
    # run the vi to return the trained network.
    vae = vi.run(train_dataset=ds_train, epochs=5)
    # get the trained loss
    trained_loss = vi.get_train_loss()
    # test function: generate_sample
    generated_sample = vae.generate_sample(64, IMAGE_SHAPE)
    # test function: reconstruct_sample (the last batch's result is reported)
    for sample in ds_train.create_dict_iterator():
        sample_x = Tensor(sample['image'], dtype=mstype.float32)
        reconstructed_sample = vae.reconstruct_sample(sample_x)
    print('The loss of the trained network is ', trained_loss)
    # Message typo fixed ('hape' -> 'shape').
    print('The shape of the generated sample is ', generated_sample.shape)
    print('The shape of the reconstructed sample is ',
          reconstructed_sample.shape)
Exemple #5
0
    def __init__(self, network, total_steps=1, sens=16384.0):
        """Set up the optimizer, gradient operation and optional gradient reducer.

        Args:
            network: the cell to train; it is switched to training mode and
                flagged with ``defer_inline=True``.
            total_steps: passed to ``dynamic_lr`` to build the learning rate.
            sens: stored on the instance and used as the Adam ``loss_scale``.
        """
        super(TrainStepWrap, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_train()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())

        learning_rate = dynamic_lr(0.01, total_steps, 5000)
        self.optimizer = nn.Adam(self.weights,
                                 learning_rate=learning_rate,
                                 beta1=0.9,
                                 beta2=0.999,
                                 eps=1e-8,
                                 loss_scale=sens)

        self.hyper_map = C.HyperMap()
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens

        # A reducer is only built for data-parallel and hybrid-parallel runs.
        current_mode = _get_parallel_mode()
        self.reducer_flag = current_mode in (ParallelMode.DATA_PARALLEL,
                                             ParallelMode.HYBRID_PARALLEL)
        self.grad_reducer = None
        if self.reducer_flag:
            use_mean = _get_gradients_mean()
            device_count = _get_device_num()
            self.grad_reducer = DistributedGradReducer(
                self.optimizer.parameters, use_mean, device_count)
Exemple #6
0
def test_adam_group1():
    """Compile an Adam train step where one group has its own lr list and weight decay.

    The first parameter uses a polynomial-decay learning-rate list and a
    weight decay of 0.9; the second falls back to the optimizer defaults.
    """
    inputs = Tensor(np.ones([1, 64]).astype(np.float32))
    label = Tensor(np.zeros([1, 10]).astype(np.float32))
    net = Net()
    net.set_train()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits()
    loss_net = WithLossCell(net, loss_fn)
    params = net.trainable_params()

    decayed_lr = polynomial_decay_lr(0.01, 0.0001, total_step=10,
                                     step_per_epoch=1, decay_epoch=3, power=1.0)

    first_group = {'params': [params[0]], 'lr': decayed_lr, 'weight_decay': 0.9}
    second_group = {'params': [params[1]]}
    adam = nn.Adam([first_group, second_group], learning_rate=0.1)

    step_cell = TrainOneStepCell(loss_net, adam)
    _executor.compile(step_cell, inputs, label)
    def train_mindspore_impl(self):
        """Train a single Dense layer with Adam on random data and return its output.

        Inputs, initial weight and bias are drawn from ``np.random.randn``;
        labels are random class indices turned into one-hot float32 rows.
        The layer is trained for ``self.epoch`` steps on that one batch.

        Returns:
            The Dense layer's output on the training input, as a NumPy array.
        """
        batch, in_dim, out_dim = self.batch_num, self.input_channels, self.output_channels

        # Draw order (input, weight, bias, labels) matters for seeded runs.
        features = Tensor(np.random.randn(batch, in_dim).astype(np.float32))
        init_weight = Tensor(np.random.randn(out_dim, in_dim).astype(np.float32))
        init_bias = Tensor(np.random.randn(out_dim).astype(np.float32))

        class_ids = np.random.randint(out_dim, size=batch)
        one_hot = np.zeros((batch, out_dim), dtype=np.float32)
        one_hot[np.arange(batch), class_ids] = 1.0
        targets = Tensor(one_hot)

        dense = Dense(in_channels=in_dim,
                      out_channels=out_dim,
                      weight_init=init_weight,
                      bias_init=init_bias, has_bias=True)
        loss_fn = SoftmaxCrossEntropyWithLogits()
        adam = nn.Adam(dense.trainable_params(),
                       learning_rate=1e-3,
                       beta1=0.9, beta2=0.999, eps=self.epsilon,
                       use_locking=False,
                       use_nesterov=False, weight_decay=0.0,
                       loss_scale=1.0)

        step_cell = TrainOneStepCell(WithLossCell(dense, loss_fn), adam)
        step_cell.set_train()

        print('MS Initialized!')
        for _ in range(self.epoch):
            step_cell(features, targets)
        result = dense(features)
        print("===============output=================", result)
        return result.asnumpy()
Exemple #8
0
def train():
    """Train the VGG-based EAST network on the images under /cache.

    Runs in graph mode on Ascend for 600 epochs with Adam and a piecewise
    constant learning rate (1e-3 up to step 100, 1e-4 up to step 300), without
    dataset sinking. Time and loss monitors are the only callbacks; no
    checkpoint callback is attached.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")

    epoch = 600

    my_dataset.download_dataset()
    img_dir = os.path.abspath('/cache/train_img')
    gt_dir = os.path.abspath('/cache/train_gt')
    dataset = datasetV2.create_icdar_train_dataset(img_dir,
                                                   gt_dir,
                                                   batch_size=14,
                                                   repeat_num=1,
                                                   is_training=True,
                                                   num_parallel_workers=24)
    dataset_size = dataset.get_dataset_size()

    print("Create dataset done!, dataset_size: ", dataset_size)

    net = EAST_VGG.EAST()

    lr = piecewise_constant_lr([100, 300], [1e-3, 1e-4])
    opt = nn.Adam(filter(lambda p: p.requires_grad, net.get_parameters()),
                  learning_rate=lr)
    # Wrap the backbone with its loss, then with the training step.
    net = my_loss.TrainingWrapper(my_loss.EASTWithLossCell(net), opt)
    net.set_train(True)

    monitors = [TimeMonitor(data_size=dataset_size), LossMonitor()]

    model = Model(net)
    print("start trainig")
    model.train(epoch,
                dataset,
                callbacks=monitors,
                dataset_sink_mode=False)
Exemple #9
0
 def __init__(self, network, num_class, label, mask, learning_rate, l2_coeff):
     """Build the Adam training step and the masked-accuracy helper.

     Args:
         network: the network being trained; also kept as ``self.network``.
         num_class, label, mask: forwarded to ``LossNetWrapper`` and
             ``MaskedAccuracy``.
         learning_rate: learning rate for Adam.
         l2_coeff: forwarded to ``LossNetWrapper``.
     """
     super(TrainGAT, self).__init__(auto_prefix=False)
     self.network = network
     net_with_loss = LossNetWrapper(network, num_class, label, mask, l2_coeff)
     adam = nn.Adam(net_with_loss.trainable_params(), learning_rate=learning_rate)
     self.loss_train_net = TrainOneStepCell(net_with_loss, adam)
     self.accuracy_func = MaskedAccuracy(num_class, label, mask)
Exemple #10
0
 def __init__(self, network, label, mask, config):
     """Build the Adam training step and the accuracy helper.

     Args:
         network: the network being trained; also kept as ``self.network``.
         label, mask: forwarded to ``LossWrapper`` and ``Accuracy``.
         config: supplies ``weight_decay`` (for the loss wrapper) and
             ``learning_rate`` (for Adam).
     """
     super(TrainNetWrapper, self).__init__(auto_prefix=False)
     self.network = network
     net_with_loss = LossWrapper(network, label, mask, config.weight_decay)
     adam = nn.Adam(net_with_loss.trainable_params(), learning_rate=config.learning_rate)
     self.loss_train_net = TrainOneStepCell(net_with_loss, adam)
     self.accuracy = Accuracy(label, mask)
Exemple #11
0
def test_vae_gan():
    """Run 5 epochs of SVI training on a VaeGan model (smoke test, no assertions)."""
    model = VaeGan()
    criterion = VaeGanLoss()
    adam = nn.Adam(params=model.trainable_params(), learning_rate=0.001)
    train_set = create_dataset(image_path, 128, 1)
    loss_cell = nn.WithLossCell(model, criterion)
    inference = SVI(net_with_loss=loss_cell, optimizer=adam)
    # The trained network is returned but not inspected further.
    model = inference.run(train_dataset=train_set, epochs=5)
Exemple #12
0
def get_eval_optimizer(net, steps_per_epoch, args):
    """Return an Adam optimizer over ``net``'s trainable parameters.

    The learning rate is the "linear" schedule from ``get_lr`` (init 1e-3,
    max 1e-2, end 6e-6, 5 warm-up epochs over ``args.epoch_size`` epochs),
    wrapped in a ``Tensor``.
    """
    schedule = Tensor(get_lr(lr_init=1e-3,
                             lr_end=6e-6,
                             lr_max=1e-2,
                             warmup_epochs=5,
                             total_epochs=args.epoch_size,
                             steps_per_epoch=steps_per_epoch,
                             lr_decay_mode="linear"))
    return nn.Adam(params=net.trainable_params(), learning_rate=schedule)
Exemple #13
0
def train_net__(data_dir, seg_dir, run_distribute, config=None):
    """Time UNet3d training steps on a fixed random batch.

    No dataset is read: a single random input/label pair is fed for
    5 steps per epoch and the loss and wall-clock time of every step are
    printed. The network is cast to float16 and wrapped with its loss
    before the training cell is built.

    Args:
        data_dir, seg_dir, run_distribute: accepted but not used here.
        config: settings object; ``loss_scale`` and ``epoch_size`` are read
            from it and it is passed to ``UNet3d`` and ``dynamic_lr``.
    """
    train_data_size = 5
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)

    loss_fn = SoftmaxCrossEntropyWithLogits()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale,
                                          drop_overflow_update=False)
    network.set_train()
    network.to_float(mstype.float16)
    _do_keep_batchnorm_fp32(network)
    network = _add_loss_network(network, loss_fn, mstype.float16)
    loss_scale = scale_manager.get_loss_scale()
    update_cell = scale_manager.get_update_cell()
    if update_cell is not None:
        model = nn.TrainOneStepWithLossScaleCell(
            network, optimizer, scale_sense=update_cell).set_train()
    else:
        model = nn.TrainOneStepCell(network, optimizer, loss_scale).set_train()

    inputs = mindspore.Tensor(np.random.rand(1, 1, 224, 224, 96),
                              mstype.float32)
    labels = mindspore.Tensor(np.random.rand(1, 4, 224, 224, 96),
                              mstype.float32)

    step_per_epoch = train_data_size
    # Taken from the `config` argument like every other setting, rather than
    # from a module-level `cfg`.
    total_epochs = config.epoch_size
    print("============== Starting Training ==============")
    for epoch_id in range(total_epochs):
        time_epoch = 0.0
        for step_id in range(step_per_epoch):
            time_start = time.time()
            # asnumpy() is inside the timed region, as before.
            step_loss = model(inputs, labels).asnumpy()
            time_step = time.time() - time_start
            time_epoch = time_epoch + time_step
            print(
                'Epoch: [%3d/%3d], step: [%5d/%5d], loss: [%6.4f], time: [%.4f]'
                % (epoch_id, total_epochs, step_id, step_per_epoch, step_loss,
                   time_step))
        print('Epoch time: %10.4f, per step time: %7.4f' %
              (time_epoch, time_epoch / step_per_epoch))

    print("============== End Training ==============")
Exemple #14
0
def train_net(data_dir,
              cross_valid_ind=1,
              epochs=400,
              batch_size=16,
              lr=0.0001,
              run_distribute=False,
              cfg=None):
    """Train a UNet with Adam, fixed loss scaling and per-epoch checkpoints.

    Args:
        data_dir: dataset location, forwarded to ``create_dataset``.
        cross_valid_ind: forwarded to ``create_dataset``.
        epochs: forwarded to ``create_dataset``; ``Model.train`` itself is
            called with 1 (presumably the dataset is repeated instead;
            verify against ``create_dataset``).
        batch_size: batch size for the dataset and the step monitor.
        lr: Adam learning rate.
        run_distribute: when true, initialise data-parallel training.
        cfg: mapping with the keys read below ('num_channels', 'num_classes',
            'resume', 'resume_ckpt', 'keep_checkpoint_max', 'weight_decay',
            'loss_scale', 'FixedLossScaleManager').

    Note: ``device_id`` is not a parameter; it is read from the enclosing
    module scope.
    """
    if run_distribute:
        init()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          device_num=get_group_size(),
                                          gradients_mean=False)

    net = UNet(n_channels=cfg['num_channels'], n_classes=cfg['num_classes'])
    if cfg['resume']:
        load_param_into_net(net, load_checkpoint(cfg['resume_ckpt']))

    criterion = CrossEntropyWithLogits()
    train_dataset, _ = create_dataset(data_dir, epochs, batch_size, True,
                                      cross_valid_ind, run_distribute)
    steps_per_epoch = train_dataset.get_dataset_size()
    print("dataset length is:", steps_per_epoch)

    # One checkpoint per pass over the dataset.
    checkpoint_cb = ModelCheckpoint(
        prefix='ckpt_unet_medical_adam',
        directory='./ckpt_{}/'.format(device_id),
        config=CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                keep_checkpoint_max=cfg['keep_checkpoint_max']))

    optimizer = nn.Adam(params=net.trainable_params(),
                        learning_rate=lr,
                        weight_decay=cfg['weight_decay'],
                        loss_scale=cfg['loss_scale'])

    scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(
        cfg['FixedLossScaleManager'], False)

    model = Model(net,
                  loss_fn=criterion,
                  loss_scale_manager=scale_manager,
                  optimizer=optimizer,
                  amp_level="O3")

    print("============== Starting Training ==============")
    callbacks = [StepLossTimeMonitor(batch_size=batch_size), checkpoint_cb]
    model.train(1, train_dataset, callbacks=callbacks, dataset_sink_mode=False)
    print("============== End Training ==============")
def test_sit_embedding_lookup_net():
    """Check that one Adam step gives matching outputs with ``unique`` on and off.

    Two identically configured embedding-lookup networks each take a single
    training step; the first optimizer has ``unique = True``, the second has
    ``unique = False`` and ``target = "CPU"``. Outputs must agree within
    rtol = atol = 0.001.
    """
    indices = Tensor(np.array([0, 1, 2]).astype(np.int32))
    label = Tensor(np.random.randn(3, 8).astype(np.float32))

    # First run: optimizer with unique enabled.
    net_unique = NetWithEmbeddingLookUp(vocab_size=8, embedding_size=8, target="CPU")
    loss = nn.SoftmaxCrossEntropyWithLogits(reduction="mean")
    opt_unique = nn.Adam(params=net_unique.trainable_params(), learning_rate=0.1)
    opt_unique.unique = True
    step_unique = TrainOneStepCell(WithLossCell(net_unique, loss), opt_unique)
    step_unique.set_train()
    out_unique = step_unique(indices, label)

    # Second run: unique disabled, optimizer target set to CPU.
    net_plain = NetWithEmbeddingLookUp(vocab_size=8, embedding_size=8, target="CPU")
    opt_plain = nn.Adam(params=net_plain.trainable_params(), learning_rate=0.1)
    opt_plain.unique = False
    opt_plain.target = "CPU"
    step_plain = TrainOneStepCell(WithLossCell(net_plain, loss), opt_plain)
    step_plain.set_train()
    out_plain = step_plain(indices, label)

    assert np.allclose(out_unique.asnumpy(), out_plain.asnumpy(), 0.001, 0.001)
Exemple #16
0
def test_embedding_lookup_with_mix_precision():
    """Run two training steps of an embedding-lookup net built at amp level "O2".

    Smoke test: the optimizer's sparse op is pinned to CPU through the
    ``primitive_target`` attribute; there are no assertions.
    """
    data = Tensor(np.array([0, 1, 2]).astype(np.int32))
    label = Tensor(np.random.randn(2, 3, 2, 2).astype(np.float32))
    net = EmbeddingLookUpBnNet(8, 8, target='CPU')

    loss_fn = nn.SoftmaxCrossEntropyWithLogits(reduction='mean')
    adam = nn.Adam(params=net.trainable_params(), learning_rate=0.1)
    adam.sparse_opt.add_prim_attr("primitive_target", "CPU")
    step_net = ms.amp.build_train_network(net, adam, loss_fn, level="O2")
    step_net.set_train()
    for _ in range(2):
        step_net(data, label)
Exemple #17
0
def main():
    """Train an ELBO network on MNIST for one epoch and save sample images.

    After training, the first batch is encoded and decoded to write
    'result/origin_x.png' and 'result/reconstruct_x.png'; four further
    generator calls are stacked into 'result/sample_x.png'. The first
    trainable parameter is printed before and after training.
    """
    # We currently support pynative mode with device GPU
    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')
    epoch_size = 1
    batch_size = 32
    mnist_path = "/data/chengzi/zhusuan-mindspore/data/MNIST"
    repeat_size = 1

    # Model dimensions
    z_dim = 40
    x_dim = 32 * 32

    # Network: generator + variational model combined under ELBO
    generator = Generator(x_dim, z_dim, batch_size)
    variational = Variational(x_dim, z_dim, batch_size)
    network = zs.variational.ELBO(generator, variational)

    # Loss and learning rate
    lr = 0.001
    net_loss = ReduceMeanLoss()

    # Optimizer
    print(network.trainable_params()[0])
    net_opt = nn.Adam(network.trainable_params(), lr)

    model = Model(network, net_loss, net_opt)

    train_dir = os.path.join(mnist_path, "train")
    ds_train = create_dataset(train_dir, batch_size, repeat_size)
    model.train(epoch_size,
                ds_train,
                callbacks=[LossMonitor()],
                dataset_sink_mode=False)

    print(network.trainable_params()[0])

    # Only the first batch is needed for the reconstruction picture.
    for item in ds_train.create_tuple_iterator():
        batch_x = item[0].reshape(batch_size, x_dim)
        break
    z, _ = network.variational(Tensor(batch_x), None, None)
    sample, _, _, _ = network.generator(None, z, None)
    save_img(batch_x, 'result/origin_x.png')
    save_img(sample.asnumpy(), 'result/reconstruct_x.png')

    # Four generator calls with no inputs, stacked along axis 0.
    drawn = []
    for _ in range(4):
        sample, _, _, _ = network.generator(None, None, None)
        drawn.append(sample.asnumpy())
    samples = np.concatenate(drawn, axis=0)
    save_img(samples, 'result/sample_x.png', num=4 * batch_size)
Exemple #18
0
def train_net(data_dir, seg_dir, run_distribute, config=None):
    """Train UNet3d with Adam, fixed loss scaling and amp level 'O3'.

    Args:
        data_dir, seg_dir: accepted but not used; the dataset comes from
            ``create_dataset_diy()`` instead.
        run_distribute: when true, initialise data-parallel training.
        config: settings object; ``loss_scale``, ``keep_checkpoint_max``,
            ``model`` and ``epoch_size`` are read from it and it is passed
            to ``UNet3d`` and ``dynamic_lr``.
    """
    if not run_distribute:
        rank_id = 0
        rank_size = 1
    else:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          device_num=rank_size,
                                          gradients_mean=True)

    train_dataset = create_dataset_diy()
    steps_per_epoch = train_dataset.get_dataset_size()
    print("train dataset length is:", steps_per_epoch)

    network = UNet3d(config=config)

    loss = SoftmaxCrossEntropyWithLogits()
    lr = Tensor(dynamic_lr(config, steps_per_epoch), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale,
                                          drop_overflow_update=False)
    network.set_train()

    model = Model(network,
                  loss_fn=loss,
                  optimizer=optimizer,
                  loss_scale_manager=scale_manager,
                  amp_level='O3')

    time_cb = TimeMonitor(data_size=steps_per_epoch)
    loss_cb = LossMonitor(per_print_times=2)
    # NOTE(review): the directory is keyed on rank_size while rank_id is never
    # used - confirm whether per-rank directories were intended.
    checkpoint_cb = ModelCheckpoint(
        prefix='{}'.format(config.model),
        directory='./ckpt_{}/'.format(rank_size),
        config=CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                keep_checkpoint_max=config.keep_checkpoint_max))

    print("============== Starting Training ==============")
    model.train(config.epoch_size,
                train_dataset,
                callbacks=[loss_cb, time_cb, checkpoint_cb],
                dataset_sink_mode=False)
    print("============== End Training ==============")
Exemple #19
0
 def train_mindspore_impl(self, indices, epoch, batch_size, use_parallel=True):
     """Train this cell on fake data with Adam, then evaluate it on ``indices``.

     Args:
         indices: input passed to the network after training.
         epoch: number of single-epoch ``Model.train`` calls to make.
         batch_size: batch size of the ``FakeData`` source.
         use_parallel: forwarded to ``FakeData``.

     Returns:
         The network output for ``indices``.
     """
     fake_data = FakeData(size=8, batch_size=batch_size, num_class=8, image_size=(), use_parallel=use_parallel)
     fake_data.set_image_data_type(np.int32)
     self.set_train()
     criterion = nn.SoftmaxCrossEntropyWithLogits()
     adam = nn.Adam(self.trainable_params())
     adam.target = "CPU"
     trainer = Model(self, criterion, adam)
     for _ in range(epoch):
         trainer.train(1, fake_data, dataset_sink_mode=False)
     return self(indices)
Exemple #20
0
    def __init__(self,
                 network,
                 neg_item_num,
                 l2_embed,
                 learning_rate,
                 epsilon,
                 dist_reg=0.002):
        """Wrap ``network`` with its loss and a one-step Adam trainer.

        Args:
            network: the network being trained; also kept as ``self.network``.
            neg_item_num, l2_embed, dist_reg: forwarded to ``LossWrapper``.
            learning_rate: learning rate for Adam.
            epsilon: passed to Adam as ``eps``.
        """
        super(TrainBGCF, self).__init__(auto_prefix=False)

        self.network = network
        net_with_loss = LossWrapper(network, neg_item_num, l2_embed, dist_reg)
        adam = nn.Adam(net_with_loss.trainable_params(),
                       learning_rate=learning_rate,
                       eps=epsilon)
        self.loss_train_net = TrainOneStepCell(net_with_loss, adam)
Exemple #21
0
def TrainWrap(net, loss_fn=None, optimizer=None, weights=None):
    """
    Wrap ``net`` into a ``TrainOneStepCell``.

    Defaults applied when an argument is None:
      * loss_fn: sparse softmax cross-entropy with mean reduction.
      * weights: ``net``'s trainable parameters as a ``ParameterTuple``.
      * optimizer: Adam over ``weights`` (lr 0.003, eps 1e-5,
        weight decay 4e-5, loss scale 1.0).

    Returns:
        The ``nn.TrainOneStepCell`` combining the loss cell and optimizer.
    """
    criterion = loss_fn
    if criterion is None:
        criterion = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)
    loss_cell = nn.WithLossCell(net, criterion)
    loss_cell.set_train()

    params = ParameterTuple(net.trainable_params()) if weights is None else weights
    if optimizer is None:
        optimizer = nn.Adam(params, learning_rate=0.003, beta1=0.9, beta2=0.999, eps=1e-5, use_locking=False,
                            use_nesterov=False, weight_decay=4e-5, loss_scale=1.0)
    return nn.TrainOneStepCell(loss_cell, optimizer)
Exemple #22
0
def csd_train(train_loader, net, opt):
    """Train ``net`` with the CSD loss, saving a checkpoint every 10th epoch.

    The learning rate is a per-step list: ``opt.lr`` halved every 200 epochs,
    repeated for each step of the epoch. The summed loss of every epoch is
    printed.

    Args:
        train_loader: dataset whose dict iterator yields "LR" and "HR" entries.
        net: network to train; it is also the object that gets checkpointed.
        opt: options object; reads ``epochs``, ``lr``, ``loss_scale``,
            ``contra_lambda``, ``neg_num``, ``stu_width_mult`` and ``filename``.
    """
    set_seed(1)
    print("[CSD] Start Training...")

    step_size = train_loader.get_dataset_size()
    lr_schedule = []
    for epoch in range(opt.epochs):
        epoch_lr = opt.lr / (2 ** ((epoch + 1) // 200))
        lr_schedule.extend([epoch_lr] * step_size)
    optim = nn.Adam(net.trainable_params(), learning_rate=lr_schedule, loss_scale=opt.loss_scale)

    # Loss hyper-parameters come from `opt`, like every other setting here,
    # rather than from a module-level `args`.
    net_with_loss = NetWithCSDLossCell(net, opt.contra_lambda, opt.neg_num)
    train_cell = TrainOneStepCell(net_with_loss, optim)
    net.set_train()

    for epoch in range(opt.epochs):
        epoch_loss = 0
        for batch in train_loader.create_dict_iterator():
            low_res = batch["LR"]
            high_res = batch["HR"]

            loss = train_cell(low_res, high_res, Tensor(opt.stu_width_mult), Tensor(1.0))
            epoch_loss += loss

        # epoch_loss is still the plain int 0 if the loader yielded no batches.
        loss_value = epoch_loss.asnumpy() if hasattr(epoch_loss, "asnumpy") else epoch_loss
        print(f"Epoch[{epoch}] loss: {loss_value}")

        if epoch % 10 == 0:
            print('===> Saving model...')
            save_checkpoint(net, f'./ckpt/{opt.filename}.ckpt')
Exemple #23
0
 def __init__(self, args, loader, my_model):
     """Store the training inputs and build the loss, optimizer and train cell.

     Args:
         args: options object; reads ``scale``, ``lr`` and ``con_loss``.
         loader: training data loader, kept as ``self.trainloader``.
         my_model: model to train; it is switched to training mode.
     """
     # The same loss scale goes to the optimizer and the train-step cell.
     loss_scale = 1024.0

     self.args = args
     self.scale = args.scale
     self.trainloader = loader
     self.model = my_model
     self.model.set_train()
     self.criterion = nn.L1Loss()
     self.con_loss = SupConLoss()
     self.optimizer = nn.Adam(self.model.trainable_params(),
                              learning_rate=args.lr,
                              loss_scale=loss_scale)
     self.train_net = MyTrain(self.model,
                              self.criterion,
                              self.con_loss,
                              use_con=args.con_loss)
     self.bp = MyTrainOneStepCell(self.train_net, self.optimizer, loss_scale)
Exemple #24
0
def train(Net):
    """Train ``Net`` for 30 epochs, evaluate it, and return the ``Model``.

    Args:
        Net: network class; it is instantiated with ``cfg.num_classes``.

    Returns:
        The ``Model`` wrapping the trained network.
    """
    train_set, test_set = create_dataset()
    # Build the network.
    network = Net(cfg.num_classes)
    # Define the loss function and the optimizer.
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    adam = nn.Adam(network.trainable_params(), cfg.lr)
    # Train the model.
    model = Model(network,
                  loss_fn=criterion,
                  optimizer=adam,
                  metrics={'acc': Accuracy()})
    print("============== Starting Training ==============")
    model.train(30, train_set, callbacks=[LossMonitor()], dataset_sink_mode=True)
    # Validate on the test split.
    print(model.eval(test_set))

    return model
Exemple #25
0
def test_adam_group2():
    """Compile an Adam train step mixing a fixed group lr with a default schedule.

    The first parameter uses lr 0.02 and weight decay 0.9; the second falls
    back to the optimizer-level ``PolynomialDecayLR`` schedule.
    """
    inputs = Tensor(np.ones([1, 64]).astype(np.float32))
    label = Tensor(np.zeros([1, 10]).astype(np.float32))
    net = Net()
    net.set_train()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits()
    loss_net = WithLossCell(net, loss_fn)
    params = net.trainable_params()

    default_schedule = lr_schedules.PolynomialDecayLR(0.01, 0.0001, 3, power=1.0)
    first_group = {'params': [params[0]], 'lr': 0.02, 'weight_decay': 0.9}
    second_group = {'params': [params[1]]}
    adam = nn.Adam([first_group, second_group], learning_rate=default_schedule)
    step_cell = TrainOneStepCell(loss_net, adam)
    _executor.compile(step_cell, inputs, label)
def test_svi_cvae():
    """Train a conditional VAE with SVI, then exercise its sampling helpers.

    Steps: build the encoder/decoder pair and the ``ConditionalVAE``, train
    it for 5 epochs through ``SVI`` with an ``ELBO`` loss and Adam, then call
    ``generate_sample`` and ``reconstruct_sample`` and print the resulting
    loss and shapes.
    """
    # Model: encoder + decoder wrapped into the conditional VAE.
    model = ConditionalVAE(Encoder(num_classes=10),
                           Decoder(),
                           hidden_size=400,
                           latent_size=20,
                           num_classes=10)

    # Loss and optimizer.
    elbo = ELBO(latent_prior='Normal', output_prior='Normal')
    adam = nn.Adam(params=model.trainable_params(), learning_rate=0.001)

    # Training data and the customised loss wrapper.
    train_set = create_dataset(image_path, 128, 1)
    loss_net = CVAEWithLossCell(model, elbo)

    # Variational inference: run() hands back the trained network.
    svi = SVI(net_with_loss=loss_net, optimizer=adam)
    model = svi.run(train_dataset=train_set, epochs=5)
    trained_loss = svi.get_train_loss()

    # generate_sample: labels 0..7 repeated eight times (64 labels).
    labels = Tensor(list(range(8)) * 8, dtype=mstype.int32)
    generated_sample = model.generate_sample(labels, 64, IMAGE_SHAPE)

    # reconstruct_sample: runs on every batch; the last result is reported.
    batches = train_set.create_dict_iterator(output_numpy=True, num_epochs=1)
    for batch in batches:
        images = Tensor(batch['image'], dtype=mstype.float32)
        targets = Tensor(batch['label'], dtype=mstype.int32)
        reconstructed_sample = model.reconstruct_sample(images, targets)

    print('The loss of the trained network is ', trained_loss)
    print('The shape of the generated sample is ', generated_sample.shape)
    print('The shape of the reconstructed sample is ',
          reconstructed_sample.shape)
Exemple #27
0
        dataset = create_yolo_dataset(mindrecord_file, repeat_num=args_opt.epoch_size,
                                      batch_size=args_opt.batch_size, device_num=device_num, rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        net = yolov3_resnet18(ConfigYOLOV3ResNet18())
        net = YoloWithLossCell(net, ConfigYOLOV3ResNet18())
        init_net_param(net, "XavierUniform")

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=None, config=ckpt_config)

        lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=0, global_step=args_opt.epoch_size * dataset_size,
                           decay_step=1000, decay_rate=0.95, steps=True))
        opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        if args_opt.checkpoint_path != "":
            param_dict = load_checkpoint(args_opt.checkpoint_path)
            load_param_into_net(net, param_dict)

        callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]

        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch return a loss.")
            dataset_sink_mode = True
        print("Start train YOLOv3, the first epoch will be slower because of the graph compilation.")
        model.train(args_opt.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)

if __name__ == "__main__":
    # define the encoder and decoder
    encoder = Encoder(num_classes=10)
    decoder = Decoder()
    # define the cvae model
    cvae = ConditionalVAE(encoder,
                          decoder,
                          hidden_size=400,
                          latent_size=20,
                          num_classes=10)
    # define the loss function
    net_loss = ELBO(latent_prior='Normal', output_prior='Normal')
    # define the optimizer
    optimizer = nn.Adam(params=cvae.trainable_params(), learning_rate=0.001)
    # define the training dataset
    ds_train = create_dataset(image_path, 128, 1)
    # define the WithLossCell modified
    net_with_loss = WithLossCell(cvae, net_loss)
    # define the variational inference
    vi = SVI(net_with_loss=net_with_loss, optimizer=optimizer)
    # run the vi to return the trained network.
    cvae = vi.run(train_dataset=ds_train, epochs=10)
    # get the trained loss
    trained_loss = vi.get_train_loss()
    # test function: generate_sample
    sample_label = Tensor([i for i in range(0, 8)] * 8, dtype=mstype.int32)
    generated_sample = cvae.generate_sample(sample_label, 64, IMAGE_SHAPE)
    # test function: reconstruct_sample
    for sample in ds_train.create_dict_iterator():
Exemple #29
0
def main():
    """Prepare the MindRecord dataset and train YOLOv3-ResNet18.

    Workflow:
      1. Parse command-line options and read DEVICE_ID / RANK_SIZE / RANK_ID
         from the environment (falling back to the CLI values when unset).
      2. Copy the MindRecord directory to local cache; when the first
         MindRecord shard is missing, copy the raw data and annotation file
         and convert them to MindRecord.
      3. Unless ``--only_create_dataset`` is set, build the network, the
         learning-rate schedule, the optimizer and the callbacks, then train.

    Raises:
        KeyError: if ``--pre_trained`` is given with
            ``--pre_trained_epoch_size`` <= 0.
    """

    def _str2bool(value):
        """Parse a CLI boolean.

        ``type=bool`` treats every non-empty string (including "False") as
        True. Here the usual false spellings map to False and every other
        non-empty value stays True, as before.
        """
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ("false", "f", "no", "n", "0"):
            return False
        return bool(text)

    parser = argparse.ArgumentParser(description="YOLOv3 train")
    parser.add_argument("--only_create_dataset", type=_str2bool, default=False,
                        help="If set it true, only create Mindrecord, default is false.")
    parser.add_argument("--distribute", type=_str2bool, default=False, help="Run distribute, default is false.")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--lr", type=float, default=0.001, help="Learning rate, default is 0.001.")
    parser.add_argument("--mode", type=str, default="sink", help="Run sink mode or not, default is sink")
    parser.add_argument("--epoch_size", type=int, default=10, help="Epoch size, default is 10")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
    parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained checkpoint file path")
    parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size")
    parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
    parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
    parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord",
                        help="Mindrecord directory. If the mindrecord_dir is empty, it will generate mindrecord file "
                             "by image_dir and anno_path. Note if mindrecord_dir isn't empty, it will use "
                             "mindrecord_dir rather than image_dir and anno_path. Default is ./Mindrecord")
    parser.add_argument('--data_url', type=str, default=None, help='Dataset path')
    parser.add_argument('--train_url', type=str, default=None, help='Train output path')
    parser.add_argument("--anno_path", type=str, default="", help="Annotation path.")
    args_opt = parser.parse_args()

    # The environment wins when set; otherwise fall back to the CLI values
    # instead of crashing on int(None).
    device_id = int(os.getenv('DEVICE_ID', args_opt.device_id))
    device_num = int(os.getenv('RANK_SIZE', args_opt.device_num))
    rankid = int(os.getenv('RANK_ID', 0))

    local_data_url = '/cache/data'
    local_train_url = '/cache/ckpt'
    local_anno_url = '/cache/anno'
    local_mindrecord_url = '/cache/mindrecord'
    mox.file.copy_parallel(args_opt.mindrecord_dir, local_mindrecord_url)

    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
    if args_opt.distribute:
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          device_num=device_num)
        init()
        rank = rankid
        # Each device writes its checkpoints into its own sub-directory.
        local_train_url = os.path.join(local_train_url, str(device_id))
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")

    # MindRecord shards live in local_mindrecord_url and are named
    # train.mindrecord0, 1, ... file_num.
    if not os.path.isdir(local_mindrecord_url):
        os.makedirs(local_mindrecord_url)

    prefix = "train.mindrecord"
    mindrecord_file = os.path.join(local_mindrecord_url, prefix + "0")
    if not os.path.exists(mindrecord_file):
        if args_opt.data_url:
            mox.file.copy_parallel(args_opt.data_url, local_data_url)
        # Only touch the annotation file when a path was actually supplied;
        # previously anno_file was undefined for the default empty anno_path.
        anno_file = None
        if args_opt.anno_path:
            anno_file = os.path.join(local_anno_url, os.path.split(args_opt.anno_path)[1])
            mox.file.copy_parallel(args_opt.anno_path, anno_file)
        # The conversion takes both the image directory and the annotation
        # file, so both must be present.
        if os.path.isdir(local_data_url) and anno_file and os.path.exists(anno_file):
            print("Create Mindrecord.")
            data_to_mindrecord_byte_image(local_data_url,
                                          anno_file,
                                          local_mindrecord_url,
                                          prefix=prefix,
                                          file_num=8)
            print("Create Mindrecord Done, at {}".format(args_opt.mindrecord_dir))
            mox.file.copy_parallel(local_mindrecord_url, args_opt.mindrecord_dir)
        else:
            print("image_dir or anno_path not exits.")

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)

        # MindDataset is created from the first MindRecord shard.
        dataset = create_yolo_dataset(mindrecord_file, repeat_num=args_opt.epoch_size,
                                      batch_size=args_opt.batch_size, device_num=device_num, rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        net = yolov3_resnet18(ConfigYOLOV3ResNet18())
        net = YoloWithLossCell(net, ConfigYOLOV3ResNet18())
        init_net_param(net, "XavierUniform")

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=local_train_url, config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise KeyError("pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)

        # The schedule length is fixed (60 epochs, 160 when distributed)
        # and is independent of --epoch_size.
        total_epoch_size = 160 if args_opt.distribute else 60
        lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=args_opt.pre_trained_epoch_size * dataset_size,
                           global_step=total_epoch_size * dataset_size,
                           decay_step=1000, decay_rate=0.95, steps=True))
        opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]

        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch return a loss.")
            dataset_sink_mode = True
        print("Start train YOLOv3, the first epoch will be slower because of the graph compilation.")
        model.train(args_opt.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
        # NOTE(review): only device 1 uploads checkpoints, so a single-device
        # run on device 0 never copies them to train_url — confirm intent.
        if device_id == 1:
            mox.file.copy_parallel(local_train_url, args_opt.train_url)
Exemple #30
0
def main():
    """Configure and run DFCNN + CTC training from the global ``config``.

    Workflow:
      1. Read run settings from ``config``; when resuming, model and
         hyper-parameter settings come from the resume log instead.
      2. Set the MindSpore context for the chosen device.
      3. Build the dataset, network, loss, learning rate and optimizer,
         optionally restoring parameters from a checkpoint.
      4. Assemble the callbacks (checkpoint, logging, optional step
         evaluation, loss monitor, optional summary) and train.

    Raises:
        ValueError: on an unsupported ``pad_mode`` or optimizer type.
    """
    set_seed(1)
    date = time.strftime("%Y%m%d%H%M%S", time.localtime())
    print(f'* Preparing to train model {date}')

    # ************** configuration ****************
    # - training setting
    resume = config['resume']
    if config['mode'] == 'PYNATIVE':
        mode = context.PYNATIVE_MODE
    else:
        mode = context.GRAPH_MODE

    device = config['device']
    device_id = config['device_id']
    dataset_sink_mode = config['dataset_sink_mode']

    # use in dataset
    div = 8

    # When resuming, architecture and hyper-parameter settings must match the
    # checkpoint, so they are read from the resume log rather than `config`.
    if resume:
        print('* Resuming model...')
        resume_config_log = config['resume_config_log']
        resume_config = get_eval_config(resume_config_log)
        if 'best_ckpt' in resume_config.keys():
            resume_model_path = resume_config['best_ckpt']
        else:
            resume_model_path = resume_config['latest_model']
            print('* [WARNING] Not using the best model, but latest saved model instead.')
        settings = resume_config
    else:
        settings = config

    # setting bias and padding
    has_bias = settings['has_bias']
    use_dropout = settings['use_dropout']

    pad_mode = settings['pad_mode']

    if pad_mode == 'pad':
        padding = settings['padding']
    elif pad_mode == 'same':
        padding = 0
    else:
        raise ValueError(f"invalid pad mode: {pad_mode}!")

    if resume:
        best_acc = resume_config['best_acc']
        # 'best_ckpt' may be absent (handled above by falling back to the
        # latest model); reading it unconditionally raised KeyError. The
        # checkpoint actually resumed from is used in both cases.
        best_ckpt = resume_model_path
        print('* The best accuracy in dev dataset for the current resumed model is {:.2f}%'.format(best_acc * 100))

    # hyper-parameters
    batch_size = settings['batch_size']
    opt_type = settings['opt']
    use_dynamic_lr = settings['use_dynamic_lr']
    warmup_step = settings['warmup_step']
    warmup_ratio = settings['warmup_ratio']

    test_dev_batch_size = config['test_dev_batch_size']
    learning_rate = float(config['learning_rate'])
    epochs = config['epochs']
    loss_scale = config['loss_scale']

    # configuration of saving model checkpoint
    save_checkpoint_steps = config['save_checkpoint_steps']
    keep_checkpoint_max = config['keep_checkpoint_max']
    prefix = config['prefix'] + '_' + date
    model_dir = config['model_dir']

    # loss monitor
    loss_monitor_step = config['loss_monitor_step']

    # whether to use mindInsight summary
    use_summary = config['use_summary']

    # step_eval
    use_step_eval = config['use_step_eval']
    eval_step = config['eval_step']
    eval_epoch = config['eval_epoch']
    patience = config['patience']

    # eval in steps or epochs: eval_step == -1 disables step-based evaluation
    step_eval = eval_step != -1

    # ************** end of configuration **************
    if device == 'GPU':
        context.set_context(mode=mode, device_target=device, device_id=device_id)
    elif device == 'Ascend':
        import moxing as mox
        from utils.const import DATA_PATH, MODEL_PATH, BEST_MODEL_PATH, LOG_PATH
        obs_datapath = config['obs_datapath']
        obs_saved_model = config['obs_saved_model']
        obs_best_model = config['obs_best_model']
        obs_log = config['obs_log']
        mox.file.copy_parallel(obs_datapath, DATA_PATH)
        mox.file.copy_parallel(MODEL_PATH, obs_saved_model)
        mox.file.copy_parallel(BEST_MODEL_PATH, obs_best_model)
        mox.file.copy_parallel(LOG_PATH, obs_log)
        context.set_context(mode=mode, device_target=device)
        # The summary collector is always disabled on Ascend.
        use_summary = False

    # callbacks function
    callbacks = []

    # data
    train_loader, idx2label, label2idx = get_dataset(batch_size=batch_size, phase='train',
                                                     test_dev_batch_size=test_dev_batch_size, div=div,
                                                     num_parallel_workers=4)

    # eval_step == 0 means "evaluate once per pass over the dataset"
    if eval_step == 0:
        eval_step = train_loader.get_dataset_size()

    # network
    net = DFCNN(num_classes=len(label2idx), padding=padding, pad_mode=pad_mode,
                has_bias=has_bias, use_dropout=use_dropout)

    # Criterion
    criterion = CTCLoss()

    # resume
    if resume:
        print("* Loading parameters...")
        param_dict = load_checkpoint(resume_model_path)
        # load the parameter into net
        load_param_into_net(net, param_dict)
        print(f'* Parameters loading from {resume_model_path} succeeded!')

    net.set_train(True)
    net.set_grad(True)

    # lr schedule
    if use_dynamic_lr:
        dataset_size = train_loader.get_dataset_size()
        learning_rate = Tensor(dynamic_lr(base_lr=learning_rate, warmup_step=warmup_step,
                                          warmup_ratio=warmup_ratio, epochs=epochs,
                                          steps_per_epoch=dataset_size), mstype.float32)
        print('* Using dynamic learning rate, which will be set up as :', learning_rate.asnumpy())

    # optim
    if opt_type == 'adam':
        opt = nn.Adam(net.trainable_params(), learning_rate=learning_rate, beta1=0.9, beta2=0.999, weight_decay=0.0,
                      eps=10e-8)
    elif opt_type == 'rms':
        opt = nn.RMSProp(params=net.trainable_params(),
                         centered=True,
                         learning_rate=learning_rate,
                         momentum=0.9,
                         loss_scale=loss_scale)
    elif opt_type == 'sgd':
        opt = nn.SGD(params=net.trainable_params(), learning_rate=learning_rate)
    else:
        raise ValueError(f"optimizer: {opt_type} is not supported for now!")

    if resume:
        # load the parameter into optimizer
        load_param_into_net(opt, param_dict)

    # save_model
    config_ck = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps, keep_checkpoint_max=keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix=prefix, directory=model_dir, config=config_ck)

    # logger
    the_logger = logger(config, date)
    log = Logging(logger=the_logger, model_ckpt=ckpt_cb)

    callbacks.append(ckpt_cb)
    callbacks.append(log)

    net = WithLossCell(net, criterion)
    scaling_sens = Tensor(np.full((1), loss_scale), dtype=mstype.float32)

    net = DFCNNCTCTrainOneStepWithLossScaleCell(net, opt, scaling_sens)
    net.set_train(True)
    model = Model(net)

    if use_step_eval:
        # step evaluation (kept separate from the boolean `step_eval` flag)
        step_eval_cb = StepAccInfo(model=model, name=prefix, div=div, test_dev_batch_size=test_dev_batch_size,
                                   step_eval=step_eval, eval_step=eval_step, eval_epoch=eval_epoch,
                                   logger=the_logger, patience=patience,
                                   dataset_size=train_loader.get_dataset_size())

        callbacks.append(step_eval_cb)

    # loss monitor
    loss_monitor = LossMonitor(loss_monitor_step)

    callbacks.append(loss_monitor)

    if use_summary:
        summary_dir = os.path.join(SUMMARY_DIR, date)
        # makedirs also creates a missing parent and tolerates an existing dir
        os.makedirs(summary_dir, exist_ok=True)
        # mindInsight
        summary_collector = SummaryCollector(summary_dir=summary_dir, collect_freq=1, max_file_size=4 * 1024 ** 3)
        callbacks.append(summary_collector)

    if resume:
        the_logger.update_acc_ckpt(best_acc, best_ckpt)

    print('* Start training...')
    model.train(epochs,
                train_loader,
                callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode)