コード例 #1
0
ファイル: train.py プロジェクト: yrpang/mindspore
def train():
    """Train function.

    Builds two generators and two discriminators, wraps them in the
    generator / discriminator one-step training cells and, for every batch,
    runs one generator step followed by one discriminator step. The fake
    images handed to the discriminator step are first passed through an
    ``ImagePool`` query.

    When ``args.need_profiler`` is set, a profiler is created up front and
    training stops after the first epoch, once ``profiler.analyse()`` ran.
    """
    args = get_args("train")
    if args.need_profiler:
        # Imported here so the profiler module is only loaded when requested.
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True)

    dataset = create_dataset(args)

    # Networks: one generator / discriminator per image domain.
    gen_a = get_generator(args)
    gen_b = get_generator(args)
    dis_a = get_discriminator(args)
    dis_b = get_discriminator(args)
    load_ckpt(args, gen_a, gen_b, dis_a, dis_b)

    pool_a = ImagePool(args.pool_size)
    pool_b = ImagePool(args.pool_size)
    generator = Generator(gen_a, gen_b, args.lambda_idt > 0)

    # Losses and their optimizers.
    dis_loss = DiscriminatorLoss(args, dis_a, dis_b)
    gen_loss = GeneratorLoss(args, generator, dis_a, dis_b)
    gen_optimizer = nn.Adam(generator.trainable_params(), get_lr(args), beta1=args.beta1)
    dis_optimizer = nn.Adam(dis_loss.trainable_params(), get_lr(args), beta1=args.beta1)

    # One-step training cells.
    gen_step = TrainOneStepG(gen_loss, generator, gen_optimizer)
    dis_step = TrainOneStepD(dis_loss, dis_optimizer)

    batches = dataset.create_dict_iterator()
    reporter = Reporter(args)
    reporter.info('==========start training===============')
    for _ in range(args.max_epoch):
        reporter.epoch_start()
        for batch in batches:
            real_a, real_b = batch["image_A"], batch["image_B"]

            # Generator step; its first two outputs are the fake images.
            gen_out = gen_step(real_a, real_b)
            fake_a, fake_b = gen_out[0], gen_out[1]

            # Discriminator step on the pool-queried fakes.
            pooled_a = pool_a.query(fake_a)
            pooled_b = pool_b.query(fake_b)
            dis_out = dis_step(real_a, real_b, pooled_a, pooled_b)

            reporter.step_end(gen_out, dis_out)
            reporter.visualizer(real_a, real_b, fake_a, fake_b)
        reporter.epoch_end(gen_step)
        if args.need_profiler:
            # Profiling runs cover a single epoch only.
            profiler.analyse()
            break

    reporter.info('==========end training===============')
コード例 #2
0
ファイル: train.py プロジェクト: mindspore-ai/course
def train():
    """Train function.

    Configures the run from the module-level ``args`` / ``params`` (output
    directory, rank, epoch count, loss scale, LR milestones), builds the
    OpenPose network together with its loss, creates the training dataset,
    the learning-rate schedules, the optimizer and the callbacks, and then
    runs ``Model.train``.

    Returns:
        int: 0 once training has finished, and also 0 when one of the
        dataset paths does not exist (an error line is printed and nothing
        is trained in that case).

    Raises:
        ValueError: if ``params['optimizer']`` is neither ``"Momentum"`` nor
            ``"Adam"``, or ``params['train_type']`` is neither
            ``'clip_grad'`` nor ``'fix_loss_scale'``.
    """
    args.outputs_dir = params['save_model_path']

    if args.group_size > 1:
        # Multi-device run: data-parallel context, per-rank output directory
        # and the "_NP" variants of the hyper-parameters.
        init()
        context.set_auto_parallel_context(device_num=get_group_size(),
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        rank_dir = "ckpt_{}/".format(str(get_rank()))
        args.outputs_dir = os.path.join(args.outputs_dir, rank_dir)
        args.rank = get_rank()

        args.max_epoch = params["max_epoch_train_NP"]
        args.loss_scale = params['loss_scale'] / 2
        args.lr_steps = [int(step) for step in params["lr_steps_NP"].split(',')]
        params['train_type'] = params['train_type_NP']
        params['optimizer'] = params['optimizer_NP']
        params['group_params'] = params['group_params_NP']
    else:
        # Single-device run.
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/")
        args.rank = 0

        args.max_epoch = params["max_epoch_train"]
        args.loss_scale = params['loss_scale']
        args.lr_steps = [int(step) for step in params["lr_steps"].split(',')]

    # ---- network -----------------------------------------------------------
    print('start create network')
    criterion = openpose_loss()
    criterion.add_flags_recursive(fp32=True)
    network = OpenPoseNet(vggpath=params['vgg_path'], vgg_with_bn=params['vgg_with_bn'])
    if params["load_pretrain"]:
        print("load pretrain model:", params["pretrained_model_path"])
        load_model(network, params["pretrained_model_path"])
    train_net = BuildTrainNetwork(network, criterion)

    # ---- dataset -----------------------------------------------------------
    data_paths = (args.jsonpath_train, args.imgpath_train, args.maskpath_train)
    if not all(os.path.exists(path) for path in data_paths):
        print('Error: wrong data path')
        return 0
    print('start create dataset')

    if args.group_size > 1:
        num_worker = 20
    else:
        num_worker = 48
    de_dataset_train = create_dataset(args.jsonpath_train,
                                      args.imgpath_train,
                                      args.maskpath_train,
                                      batch_size=params['batch_size'],
                                      rank=args.rank,
                                      group_size=args.group_size,
                                      num_worker=num_worker,
                                      multiprocessing=True,
                                      shuffle=True,
                                      repeat_num=1)
    steps_per_epoch = de_dataset_train.get_dataset_size()
    print("steps_per_epoch: ", steps_per_epoch)

    # ---- learning-rate schedules -------------------------------------------
    lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size,
                                       params['lr_gamma'],
                                       steps_per_epoch,
                                       args.max_epoch,
                                       args.lr_steps,
                                       args.group_size,
                                       lr_type=params['lr_type'],
                                       warmup_epoch=params['warmup_epoch'])

    # ---- optimizer ---------------------------------------------------------
    if params['group_params']:
        # Three parameter groups, selected by parameter name, each with its
        # own learning-rate schedule.
        vgg_group = [p for p in train_net.trainable_params() if 'base.vgg_base' in p.name]
        conv_group = [p for p in train_net.trainable_params() if 'base.conv' in p.name]
        stage_group = [p for p in train_net.trainable_params() if 'base' not in p.name]
        group_params = [
            {'params': vgg_group, 'lr': lr_vgg},
            {'params': conv_group, 'lr': lr_base},
            {'params': stage_group, 'lr': lr_stage},
        ]

        if params['optimizer'] == "Momentum":
            opt = Momentum(group_params, learning_rate=lr_stage, momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(group_params)
        else:
            raise ValueError("optimizer not support.")
    else:
        if params['optimizer'] == "Momentum":
            opt = Momentum(train_net.trainable_params(), learning_rate=lr_stage, momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(train_net.trainable_params(), learning_rate=lr_stage)
        else:
            raise ValueError("optimizer not support.")

    # ---- callbacks ---------------------------------------------------------
    config_ck = CheckpointConfig(save_checkpoint_steps=params['ckpt_interval'],
                                 keep_checkpoint_max=params["keep_checkpoint_max"])
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank),
                                 directory=args.outputs_dir,
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size())
    callback_list = [MyLossMonitor(), time_cb]
    if args.rank == 0:
        # Only rank 0 writes checkpoints.
        callback_list.append(ckpoint_cb)

    # ---- training wrapper --------------------------------------------------
    train_type = params['train_type']
    if train_type == 'clip_grad':
        train_net = TrainOneStepWithClipGradientCell(train_net, opt, sens=args.loss_scale)
        train_net.set_train()
        model = Model(train_net)
    elif train_type == 'fix_loss_scale':
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
        train_net.set_train()
        model = Model(train_net, optimizer=opt, loss_scale_manager=loss_scale_manager)
    else:
        raise ValueError("Type {} is not support.".format(params['train_type']))

    print("============== Starting Training ==============")
    model.train(args.max_epoch,
                de_dataset_train,
                callbacks=callback_list,
                dataset_sink_mode=False)
    return 0
コード例 #3
0
ファイル: train.py プロジェクト: yrpang/mindspore
def main():
    """Main entrance for training.

    Parses the command line, configures the MindSpore context (graph mode,
    GPU or Ascend target, optional data-parallel setup), builds the requested
    tinynet model, the validation and training datasets, the loss, the
    learning-rate schedule, the optimizer and the callbacks, and runs
    ``Model.train``.

    Raises:
        ValueError: if ``args.train`` is not set (the training dataset is
            needed for the LR schedule and for training itself), or if
            ``args.opt`` is neither ``'sgd'`` nor ``'rmsprop'``.
        AssertionError: if ``args.model`` does not start with ``"tinynet"``
            or ``args.ema_decay`` is not positive.
    """
    args = parser.parse_args()
    print(sys.argv)
    devid, args.rank_id, args.rank_size = 0, 0, 1

    context.set_context(mode=context.GRAPH_MODE)

    if args.distributed:
        if args.GPU:
            init("nccl")
            context.set_context(device_target='GPU')
        else:
            init()
            devid = int(os.getenv('DEVICE_ID'))
            context.set_context(device_target='Ascend',
                                device_id=devid,
                                reserve_class_name_in_scope=False)
        context.reset_auto_parallel_context()
        args.rank_id = get_rank()
        args.rank_size = get_group_size()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            device_num=args.rank_size)
    else:
        if args.GPU:
            context.set_context(device_target='GPU')

    # Only the master process prints progress and gets callbacks attached.
    is_master = not args.distributed or (args.rank_id == 0)

    # parse model argument: expected form is "tinynet_<sub model>"
    assert args.model.startswith(
        "tinynet"), "Only Tinynet models are supported."
    _, sub_name = args.model.split("_")
    net = tinynet(sub_model=sub_name,
                  num_classes=args.num_classes,
                  drop_rate=args.drop,
                  drop_connect_rate=args.drop_connect,
                  global_pool="avg",
                  bn_tf=args.bn_tf,
                  bn_momentum=args.bn_momentum,
                  bn_eps=args.bn_eps)

    if is_master:
        print("Total number of parameters:", count_params(net))
    # input image size of the network
    input_size = net.default_cfg['input_size'][1]

    train_data_url = os.path.join(args.data_path, 'train')
    val_data_url = os.path.join(args.data_path, 'val')
    val_dataset = create_dataset_val(args.batch_size,
                                     val_data_url,
                                     workers=args.workers,
                                     distributed=False,
                                     input_size=input_size)

    # Everything below (time monitor, LR schedule, model.train) depends on
    # the training dataset, so fail with a clear message instead of hitting
    # an undefined ``batches_per_epoch`` further down.
    if not args.train:
        raise ValueError(
            "args.train must be set: the training dataset is required to "
            "build the learning-rate schedule and to run training.")
    train_dataset = create_dataset(args.batch_size,
                                   train_data_url,
                                   workers=args.workers,
                                   distributed=args.distributed,
                                   input_size=input_size)
    batches_per_epoch = train_dataset.get_dataset_size()

    loss = LabelSmoothingCrossEntropy(smooth_factor=args.smoothing,
                                      num_classes=args.num_classes)
    time_cb = TimeMonitor(data_size=batches_per_epoch)
    loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                               drop_overflow_update=False)

    lr_array = get_lr(base_lr=args.lr,
                      total_epochs=args.epochs,
                      steps_per_epoch=batches_per_epoch,
                      decay_epochs=args.decay_epochs,
                      decay_rate=args.decay_rate,
                      warmup_epochs=args.warmup_epochs,
                      warmup_lr_init=args.warmup_lr,
                      global_epoch=0)
    lr = Tensor(lr_array)

    loss_cb = LossMonitor(lr_array,
                          args.epochs,
                          per_print_times=args.per_print_times,
                          start_epoch=0)

    param_group = add_weight_decay(net, weight_decay=args.weight_decay)

    # The optimizer receives the same loss_scale value as the loss-scale
    # manager created above.
    if args.opt == 'sgd':
        if is_master:
            print('Using SGD optimizer')
        optimizer = SGD(param_group,
                        learning_rate=lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay,
                        loss_scale=args.loss_scale)
    elif args.opt == 'rmsprop':
        if is_master:
            print('Using rmsprop optimizer')
        optimizer = RMSProp(param_group,
                            learning_rate=lr,
                            decay=0.9,
                            weight_decay=args.weight_decay,
                            momentum=args.momentum,
                            epsilon=args.opt_eps,
                            loss_scale=args.loss_scale)
    else:
        # Previously fell through and failed later with an undefined
        # ``optimizer``.
        raise ValueError(
            "Unsupported optimizer {!r}; expected 'sgd' or 'rmsprop'.".format(
                args.opt))

    loss.add_flags_recursive(fp32=True, fp16=False)
    eval_metrics = {
        'Validation-Loss': Loss(),
        'Top1-Acc': Top1CategoricalAccuracy(),
        'Top5-Acc': Top5CategoricalAccuracy()
    }

    # Optionally start from an existing checkpoint.
    if args.ckpt:
        ckpt = load_checkpoint(args.ckpt)
        load_param_into_net(net, ckpt)
        net.set_train(False)

    model = Model(net,
                  loss,
                  optimizer,
                  metrics=eval_metrics,
                  loss_scale_manager=loss_scale_manager,
                  amp_level=args.amp_level)

    # A deep copy of the network is handed to the EMA callback as the
    # EMA network.
    net_ema = copy.deepcopy(net)
    net_ema.set_train(False)
    assert args.ema_decay > 0, "EMA should be used in tinynet training."

    ema_cb = EmaEvalCallBack(network=net,
                             ema_network=net_ema,
                             loss_fn=loss,
                             eval_dataset=val_dataset,
                             decay=args.ema_decay,
                             save_epoch=args.ckpt_save_epoch,
                             dataset_sink_mode=args.dataset_sink,
                             start_epoch=0)

    callbacks = [loss_cb, ema_cb, time_cb] if is_master else []

    if is_master:
        print("Training on " + args.model + " with " + str(args.num_classes) +
              " classes")

    model.train(args.epochs,
                train_dataset,
                callbacks=callbacks,
                dataset_sink_mode=args.dataset_sink)
コード例 #4
0
            "============ Precision is lower than expected when using vanilla RNN architecture ==========="
        )

    embedding_table = np.loadtxt(
        os.path.join(cfg.preprocess_path, "weight.txt")).astype(np.float32)

    network = textrcnn(weight=Tensor(embedding_table),
                       vocab_size=embedding_table.shape[0],
                       cell=cfg.cell,
                       batch_size=cfg.batch_size)

    ds_train = create_dataset(cfg.preprocess_path, cfg.batch_size, True)
    step_size = ds_train.get_dataset_size()

    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
    lr = get_lr(cfg, step_size)
    num_epochs = cfg.num_epochs
    if cfg.cell == "lstm":
        num_epochs = cfg.lstm_num_epochs

    opt = nn.Adam(params=network.trainable_params(), learning_rate=lr)

    loss_cb = LossMonitor()
    time_cb = TimeMonitor()
    model = Model(network, loss, opt, {'acc': Accuracy()}, amp_level="O3")

    print("============== Starting Training ==============")
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.cell,
コード例 #5
0
def _episode_targets(label, k):
    """Map the query labels of one episode onto indices of the shot classes.

    The first ``k`` entries of ``label`` belong to the shot images, the rest
    to the query images. The distinct shot labels are sorted, and every query
    label is replaced by its position in that sorted list.

    Returns:
        torch.Tensor: the query class indices, moved to the GPU.
    """
    classes = sorted(set(label[:k].tolist()))
    targets = [classes.index(query_label) for query_label in label[k:].tolist()]
    return torch.tensor(targets).cuda()


def _forward_episode(args, model, episode, k):
    """Run one episode through ``loss_mode`` (see "src/utils.py").

    The episode is moved to the GPU and split into shot images (first ``k``)
    and query images (the rest).
    Original author's note: data_shot is laid out as (nway * kshot, 3, h, w),
    not (kshot * nway, 3, h, w) - take care when reshaping it.

    Returns:
        tuple: ``(logits, loss, labels)`` where ``labels`` are the query
        targets produced by ``_episode_targets``.
    """
    data, label = [tensor.cuda() for tensor in episode]
    data_shot, data_query = data[:k], data[k:]
    labels = _episode_targets(label, k)
    logits, loss = loss_mode(args, model, data_shot, data_query, labels)
    return logits, loss, labels


def train(args):
    """Train the few-shot model episodically, with periodic validation.

    Runs ``TOTAL`` passes over the training loader. Every ``PRINT_FREQ``
    passes the running loss / accuracy averages are printed and reset, every
    ``VAL_FREQ`` passes the model is evaluated on the validation loader
    (``VAL_TOTAL`` passes, without gradients), and every ``SAVE_FREQ`` passes
    the model state dict is written to ``checkpoints/<pass>_<args.name>.pth``.

    Args:
        args: run configuration; this function reads ``nway``, ``kshot``,
            ``query``, ``dpath``, ``restore_ckpt``, ``test_mode``, ``mymode``
            and ``name``.
    """
    # Local import: only needed to create the checkpoint directory.
    import os

    # the number of N way, K shot images
    k = args.nway * args.kshot

    # Train data loading
    dataset = Dataset(args.dpath, state='train')
    train_sampler = Train_Sampler(dataset._labels,
                                  n_way=args.nway,
                                  k_shot=args.kshot,
                                  query=args.query)
    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=train_sampler,
                             num_workers=4,
                             pin_memory=True)

    # Validation data loading
    val_dataset = Dataset(args.dpath, state='val')
    val_sampler = Sampler(val_dataset._labels,
                          n_way=args.nway,
                          k_shot=args.kshot,
                          query=args.query)
    val_data_loader = DataLoader(dataset=val_dataset,
                                 batch_sampler=val_sampler,
                                 num_workers=4,
                                 pin_memory=True)

    # TODO 1.a: the few-shot model itself lives in the 'model.py' file.
    model = FewShotModel_ensemble()

    # pretrained model load
    if args.restore_ckpt is not None:
        state_dict = torch.load(args.restore_ckpt)
        model.load_state_dict(state_dict)

    model.cuda()
    model.train()

    if args.test_mode == 1:
        # NOTE(review): training continues after Test_phase returns -
        # confirm that this is intended.
        Test_phase(model, args, k)

    # TODO 1.b (optional): optimizer / scheduler for few-shot classification.
    optimizer = torch.optim.SGD(model.parameters(), lr=4e-3, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=100,
                                                gamma=0.95)
    ce_loss_fn = torch.nn.CrossEntropyLoss().cuda()
    print('Loss mode: ', args.mymode)

    tl = Averager()  # save average loss
    ta = Averager()  # save average accuracy

    # training start
    print('train start')
    for i in range(TOTAL):
        for episode in data_loader:
            optimizer.zero_grad()

            logits, loss, labels = _forward_episode(args, model, episode, k)
            acc = count_acc(logits, labels)

            tl.add(loss.item())
            ta.add(acc)

            loss.backward()
            optimizer.step()
            scheduler.step()  # the scheduler is stepped once per episode

            # Drop the references before the next episode is processed.
            del logits, loss

        if (i + 1) % PRINT_FREQ == 0:
            print('train {}, lr={:.4e} loss={:.4f} acc={:.4f}'.format(
                i + 1, get_lr(optimizer), tl.item(), ta.item()))

            # initialize loss and accuracy mean
            tl = Averager()
            ta = Averager()

        # validation start
        if (i + 1) % VAL_FREQ == 0:
            print('validation start')
            model.eval()
            with torch.no_grad():
                vl = Averager()  # save average loss
                va = Averager()  # save average accuracy
                for _ in range(VAL_TOTAL):
                    for episode in val_data_loader:
                        logits, loss, labels = _forward_episode(
                            args, model, episode, k)
                        acc = count_acc(logits, labels)

                        vl.add(loss.item())
                        va.add(acc)

                        del logits, loss

                print('val accuracy mean : %.4f' % va.item())
                print('val loss mean : %.4f' % vl.item())
            model.train()

        if (i + 1) % SAVE_FREQ == 0:
            ckpt_path = 'checkpoints/%d_%s.pth' % (i + 1, args.name)
            # torch.save does not create missing parent directories.
            os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
            torch.save(model.state_dict(), ckpt_path)
            # Report the same 1-based pass number that is used in the file
            # name and in the training log line.
            print('model saved, iteration : %d' % (i + 1))
コード例 #6
0
def train():
    """Train function.

    Parses the arguments, configures the (optionally data-parallel) run,
    builds the OpenPose network with its loss, creates the training dataset,
    the three learning-rate schedules, a grouped Adam optimizer and the
    callbacks, and runs ``Model.train`` for ``params["max_epoch_train"]``
    epochs.

    Raises:
        FileNotFoundError: if any of ``args.jsonpath_train``,
            ``args.imgpath_train`` or ``args.maskpath_train`` does not exist.
    """
    args = parse_args()

    args.outputs_dir = params['save_model_path']

    if args.group_size > 1:
        init()
        context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_{}/".format(str(get_rank())))
        args.rank = get_rank()
    else:
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/")
        args.rank = 0

    # Loss scale and LR milestones: multi-device runs use half the configured
    # loss scale and the "_NP" milestones.
    if args.group_size > 1:
        args.loss_scale = params['loss_scale'] / 2
        args.lr_steps = list(map(int, params["lr_steps_NP"].split(',')))
    else:
        args.loss_scale = params['loss_scale']
        args.lr_steps = list(map(int, params["lr_steps"].split(',')))

    # create network
    print('start create network')
    criterion = openpose_loss()
    criterion.add_flags_recursive(fp32=True)
    network = OpenPoseNet(vggpath=params['vgg_path'])

    if params["load_pretrain"]:
        print("load pretrain model:", params["pretrained_model_path"])
        load_model(network, params["pretrained_model_path"])
    train_net = BuildTrainNetwork(network, criterion)

    # create dataset
    if os.path.exists(args.jsonpath_train) and os.path.exists(args.imgpath_train) \
            and os.path.exists(args.maskpath_train):
        print('start create dataset')
    else:
        print('Error: wrong data path')
        # Stop here instead of building a dataset from paths that were just
        # found to be missing.
        raise FileNotFoundError(
            "training data path not found; checked: {}, {}, {}".format(
                args.jsonpath_train, args.imgpath_train, args.maskpath_train))

    num_worker = 20 if args.group_size > 1 else 48
    de_dataset_train = create_dataset(args.jsonpath_train, args.imgpath_train, args.maskpath_train,
                                      batch_size=params['batch_size'],
                                      rank=args.rank,
                                      group_size=args.group_size,
                                      num_worker=num_worker,
                                      multiprocessing=True,
                                      shuffle=True,
                                      repeat_num=1)
    steps_per_epoch = de_dataset_train.get_dataset_size()
    print("steps_per_epoch: ", steps_per_epoch)

    # lr scheduler
    lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size,
                                       params['lr_gamma'],
                                       steps_per_epoch,
                                       params["max_epoch_train"],
                                       args.lr_steps,
                                       args.group_size)

    # Three parameter groups, selected by parameter name, each with its own
    # learning-rate schedule.
    vgg19_base_params = list(filter(lambda x: 'base.vgg_base' in x.name, train_net.trainable_params()))
    base_params = list(filter(lambda x: 'base.conv' in x.name, train_net.trainable_params()))
    stages_params = list(filter(lambda x: 'base' not in x.name, train_net.trainable_params()))

    group_params = [{'params': vgg19_base_params, 'lr': lr_vgg},
                    {'params': base_params, 'lr': lr_base},
                    {'params': stages_params, 'lr': lr_stage}]

    # The optimizer and the loss-scale manager are given the same loss scale.
    opt = Adam(group_params, loss_scale=args.loss_scale)

    train_net.set_train(True)
    loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    model = Model(train_net, optimizer=opt, loss_scale_manager=loss_scale_manager)

    # Never checkpoint more often than once per epoch.
    params['ckpt_interval'] = max(steps_per_epoch, params['ckpt_interval'])
    config_ck = CheckpointConfig(save_checkpoint_steps=params['ckpt_interval'],
                                 keep_checkpoint_max=params["keep_checkpoint_max"])
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank), directory=args.outputs_dir, config=config_ck)
    time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size())
    callback_list = [MyLossMonitor(), time_cb, ckpoint_cb]
    print("============== Starting Training ==============")
    model.train(params["max_epoch_train"], de_dataset_train, callbacks=callback_list,
                dataset_sink_mode=False)