Example #1
0
def main(args):
    """Evaluate a trained model: build the inference program, load the
    persisted parameters from ``args.model_dir`` and log top-1 accuracy
    on the validation set.

    Args:
        args: Parsed command-line namespace. Must provide ``use_gpu``,
            ``batch_size`` and ``model_dir`` (plus whatever
            ``build_program``/``reader.train_valid`` consume).
    """
    startup_prog = fluid.Program()
    infer_prog = fluid.Program()

    # build_program wires the network into infer_prog and returns the
    # fetch targets that infer() will evaluate.
    infer_fetch_list = build_program(
        main_prog=infer_prog, startup_prog=startup_prog, args=args)

    # Freeze the graph for inference (strips backward/optimizer ops).
    infer_prog = infer_prog.clone(for_test=True)
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    valid_reader = reader.train_valid(
        batch_size=args.batch_size, is_train=False, is_shuffle=False, args=args)
    # Parameters must be loaded after startup_prog has initialized them.
    fluid.io.load_persistables(exe, args.model_dir, main_program=infer_prog)
    infer_prog = fluid.CompiledProgram(infer_prog)

    top1 = infer(infer_prog, exe, valid_reader, infer_fetch_list, args)
    logger.info("test_acc {:.6f}".format(top1))
Example #2
0
def main(args):
    """Train two MobileNetV1 models jointly (mutual learning) in dygraph mode.

    Selects the execution place from ``args``, builds train/valid data
    loaders, one PiecewiseDecay LR schedule and one Momentum optimizer per
    model, then delegates the loop to ``Trainer``.
    """
    # Place selection: CPU, single GPU, or the per-rank GPU assigned by
    # the parallel environment.
    if not args.use_gpu:
        place = fluid.CPUPlace()
    elif not args.use_data_parallel:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)

    # NOTE(review): unlike the other training scripts, these reader calls do
    # not pass args=args — confirm reader.train_valid's signature here.
    train_reader = reader.train_valid(batch_size=args.batch_size,
                                      is_train=True,
                                      is_shuffle=True)

    valid_reader = reader.train_valid(batch_size=args.batch_size,
                                      is_train=False,
                                      is_shuffle=False)

    with fluid.dygraph.guard(place):
        # Two peer models trained against each other by Trainer.
        models = [MobileNetV1(), MobileNetV1()]
        parallel_models = None
        if args.use_data_parallel:
            # prepare_context must run inside the dygraph guard; each model
            # gets its own DataParallel wrapper sharing one strategy.
            strategy = fluid.dygraph.parallel.prepare_context()
            parallel_models = [
                fluid.dygraph.parallel.DataParallel(model, strategy)
                for model in models
            ]
            # Shard the training batches across ranks.
            train_reader = fluid.contrib.reader.distributed_batch_reader(
                train_reader)

        train_loader = fluid.io.DataLoader.from_generator(
            capacity=1024,
            use_double_buffer=True,
            iterable=True,
            return_list=True,
            use_multiprocess=True)
        valid_loader = fluid.io.DataLoader.from_generator(
            capacity=1024,
            use_double_buffer=True,
            iterable=True,
            return_list=True,
            use_multiprocess=True)
        train_loader.set_batch_generator(train_reader, places=place)
        valid_loader.set_batch_generator(valid_reader, places=place)
        dataloaders = [train_loader, valid_loader]

        # Steps per epoch shrink with the number of ranks since batches
        # are sharded across devices.
        device_num = fluid.dygraph.parallel.Env().nranks
        step = int(args.trainset_num / (args.batch_size * device_num))
        # LR decays by 10x at epochs 60/120/180 (expressed in steps).
        epochs = [60, 120, 180]
        bd = [step * e for e in epochs]
        lr = [args.init_lr * (0.1**i) for i in range(len(bd) + 1)]

        # Independent (but identical) schedules: each optimizer steps its
        # own decay counter.
        lr_a = fluid.dygraph.PiecewiseDecay(bd, lr, 0)
        lr_b = fluid.dygraph.PiecewiseDecay(bd, lr, 0)
        opt_a = fluid.optimizer.MomentumOptimizer(
            lr_a,
            0.9,
            parameter_list=models[0].parameters(),
            use_nesterov=True,
            regularization=fluid.regularizer.L2DecayRegularizer(5e-4))
        opt_b = fluid.optimizer.MomentumOptimizer(
            lr_b,
            0.9,
            parameter_list=models[1].parameters(),
            use_nesterov=True,
            regularization=fluid.regularizer.L2DecayRegularizer(5e-4))
        optimizers = [opt_a, opt_b]
        trainer = Trainer(models, parallel_models, optimizers, dataloaders,
                          args.epochs, args.log_freq)
        trainer.train()
Example #3
0
def main(args):
    """Train a DARTS-style ``Network`` in dygraph mode and keep the best
    checkpoint by validation top-1 accuracy.

    Args:
        args: Parsed command-line namespace providing architecture
            (``arch``, ``init_channels``, ``layers``, ``auxiliary``,
            ``class_num``), optimization (``learning_rate``, ``momentum``,
            ``weight_decay``, ``grad_clip``, ``epochs``, ``batch_size``,
            ``trainset_num``, ``drop_path_prob``) and runtime options
            (``use_data_parallel``, ``use_multiprocess``,
            ``model_save_dir``).
    """
    # NOTE(review): always a CUDA place — there is no CPU fallback here,
    # unlike the sibling scripts; confirm GPU is required.
    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
        if args.use_data_parallel else fluid.CUDAPlace(0)

    with fluid.dygraph.guard(place):
        # Look the genotype up by attribute name instead of eval() so an
        # attacker-controlled --arch string cannot execute arbitrary code;
        # behaves identically for every valid genotype name.
        genotype = getattr(genotypes, args.arch)
        model = Network(
            C=args.init_channels,
            num_classes=args.class_num,
            layers=args.layers,
            auxiliary=args.auxiliary,
            genotype=genotype)

        logger.info("param size = {:.6f}MB".format(
            count_parameters_in_MB(model.parameters())))

        # Steps per epoch shrink with rank count: batches are sharded.
        device_num = fluid.dygraph.parallel.Env().nranks
        step_per_epoch = int(args.trainset_num /
                             (args.batch_size * device_num))
        learning_rate = fluid.dygraph.CosineDecay(args.learning_rate,
                                                  step_per_epoch, args.epochs)
        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.grad_clip)
        optimizer = fluid.optimizer.MomentumOptimizer(
            learning_rate,
            momentum=args.momentum,
            regularization=fluid.regularizer.L2Decay(args.weight_decay),
            parameter_list=model.parameters(),
            grad_clip=clip)

        if args.use_data_parallel:
            # Wrap the model only after the optimizer has captured the raw
            # parameter list.
            strategy = fluid.dygraph.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        train_loader = fluid.io.DataLoader.from_generator(
            capacity=64,
            use_double_buffer=True,
            iterable=True,
            return_list=True,
            use_multiprocess=args.use_multiprocess)
        valid_loader = fluid.io.DataLoader.from_generator(
            capacity=64,
            use_double_buffer=True,
            iterable=True,
            return_list=True,
            use_multiprocess=args.use_multiprocess)

        train_reader = reader.train_valid(
            batch_size=args.batch_size,
            is_train=True,
            is_shuffle=True,
            args=args)
        valid_reader = reader.train_valid(
            batch_size=args.batch_size,
            is_train=False,
            is_shuffle=False,
            args=args)
        if args.use_data_parallel:
            train_reader = fluid.contrib.reader.distributed_batch_reader(
                train_reader)

        train_loader.set_batch_generator(train_reader, places=place)
        valid_loader.set_batch_generator(valid_reader, places=place)

        # Only rank 0 writes checkpoints under data parallelism.
        save_parameters = (not args.use_data_parallel) or (
            args.use_data_parallel and
            fluid.dygraph.parallel.Env().local_rank == 0)
        best_acc = 0
        for epoch in range(args.epochs):
            # Drop-path probability ramps linearly over training.
            drop_path_prob = args.drop_path_prob * epoch / args.epochs
            logger.info('Epoch {}, lr {:.6f}'.format(
                epoch, optimizer.current_step_lr()))
            train_top1 = train(model, train_loader, optimizer, epoch,
                               drop_path_prob, args)
            logger.info("Epoch {}, train_acc {:.6f}".format(epoch, train_top1))
            valid_top1 = valid(model, valid_loader, epoch, args)
            if valid_top1 > best_acc:
                best_acc = valid_top1
                if save_parameters:
                    fluid.save_dygraph(model.state_dict(),
                                       args.model_save_dir + "/best_model")
            logger.info("Epoch {}, valid_acc {:.6f}, best_valid_acc {:.6f}".
                        format(epoch, valid_top1, best_acc))
Example #4
0
def main(args):
    """Static-graph training loop: build train/test programs, run them with
    a (possibly multi-device) executor, and persist the best checkpoint by
    validation top-1 accuracy.
    """
    # Device count is inferred from CUDA_VISIBLE_DEVICES; an unset/empty
    # value still yields 1 because "".split(",") == [""].
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    is_shuffle = True

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    # build_program wires the network into each program and hands back the
    # fetch targets plus the DataLoader feeding it.
    train_fetch_list, train_loader = build_program(main_prog=train_prog,
                                                   startup_prog=startup_prog,
                                                   is_train=True,
                                                   args=args)
    valid_fetch_list, valid_loader = build_program(main_prog=test_prog,
                                                   startup_prog=startup_prog,
                                                   is_train=False,
                                                   args=args)

    logger.info("param size = {:.6f}MB".format(
        utility.count_parameters_in_MB(
            train_prog.global_block().all_parameters(), 'model')))
    # Freeze the test graph (strips backward/optimizer ops).
    test_prog = test_prog.clone(for_test=True)
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    train_reader = reader.train_valid(batch_size=args.batch_size,
                                      is_train=True,
                                      is_shuffle=is_shuffle,
                                      args=args)
    valid_reader = reader.train_valid(batch_size=args.batch_size,
                                      is_train=False,
                                      is_shuffle=False,
                                      args=args)

    # Training feeds all visible devices; validation feeds the single
    # executor place.
    # NOTE(review): the places/place asymmetry below looks deliberate
    # (data-parallel train, single-device valid) — confirm.
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    train_loader.set_batch_generator(train_reader, places=places)
    valid_loader.set_batch_generator(valid_reader, places=place)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 4 * devices_num
    build_strategy = fluid.BuildStrategy()
    if args.with_mem_opt:
        # Fetched vars must be persistable or memory optimization may
        # reuse their buffers before they are read.
        for i in range(len(train_fetch_list)):
            train_fetch_list[i].persistable = True
        build_strategy.enable_inplace = True
        build_strategy.memory_optimize = True

    parallel_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=train_fetch_list[0].name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    test_prog = fluid.CompiledProgram(test_prog)

    def save_model(postfix, program):
        # Overwrite any previous checkpoint of the same name.
        model_path = os.path.join(args.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        logger.info('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=program)

    best_acc = 0
    for epoch_id in range(args.epochs):
        train_top1 = train(parallel_train_prog, exe, epoch_id, train_loader,
                           train_fetch_list, args)
        logger.info("Epoch {}, train_acc {:.6f}".format(epoch_id, train_top1))
        valid_top1 = valid(test_prog, exe, epoch_id, valid_loader,
                           valid_fetch_list, args)
        if valid_top1 > best_acc:
            best_acc = valid_top1
            # Save from train_prog (the uncompiled program owns the params).
            save_model('cifar10_model', train_prog)
        logger.info("Epoch {}, valid_acc {:.6f}, best_valid_acc {:.6f}".format(
            epoch_id, valid_top1, best_acc))