Example #1
    if options.action == "train":
        batch_size = int(options.batchSize)
        embedding_dim = int(options.embedding_dim)
        learning_rate = float(options.learning_rate)
        max_epoch = int(options.max_epoch)
        p = float(options.p)
        model_type = options.model_type
        sequence_len = 500
        max_words = 5000
        training_split = 0.8
        data_path = options.data_path
        if options.onYarn:
            # Run on YARN (yarn-client mode); requires HADOOP_CONF_DIR and a conda environment.
            hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
            assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \
                    "set the environment variable HADOOP_CONF_DIR"
            conda_env_name = detect_conda_env_name()
            sc = init_spark_on_yarn(hadoop_conf=hadoop_conf,
                                    conda_name=conda_env_name,
                                    num_executors=2,
                                    executor_cores=2,
                                    executor_memory="20g",
                                    driver_memory="10g")
        else:
            # Run Spark locally with 4 cores and 40g of driver memory.
            conf = {"spark.driver.memory": "40g"}
            sc = init_spark_on_local(cores=4, conf=conf)

        set_optimizer_version(options.optimizerVersion)
        train(sc, data_path, batch_size, sequence_len, max_words,
              embedding_dim, training_split)
        sc.stop()
    elif options.action == "test":
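The excerpt above breaks off at the "test" branch and reads its settings from an `options` object whose parser lies outside the excerpt. Below is a minimal sketch of the option parsing it implies; the option names are inferred from the attribute accesses above, and all defaults and flag spellings are assumptions, not the original parser:

from optparse import OptionParser

parser = OptionParser()
parser.add_option("--action", dest="action", default="train")
parser.add_option("--batchSize", dest="batchSize", default="128")
parser.add_option("--embedding_dim", dest="embedding_dim", default="100")
parser.add_option("--learning_rate", dest="learning_rate", default="0.001")
parser.add_option("--max_epoch", dest="max_epoch", default="10")
parser.add_option("-p", dest="p", default="0.0")
parser.add_option("--model_type", dest="model_type", default="cnn")
parser.add_option("--data_path", dest="data_path", default="/tmp/data")
parser.add_option("--on_yarn", dest="onYarn", action="store_true", default=False)  # flag name assumed
parser.add_option("--optimizerVersion", dest="optimizerVersion", default="optimizerV1")
(options, args) = parser.parse_args()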
Example #2
def init_nncontext(conf=None, cluster_mode="spark-submit", spark_log_level="WARN", redirect_spark_log=True, **kwargs):
    """
    Creates or gets a SparkContext with optimized configurations for BigDL performance.
    This method will also initialize the BigDL engine.

    Note: If you use spark-shell or Jupyter notebook, as the SparkContext is created
    before your code, you have to set the Spark configurations through command line options
    or the properties file before calling this method. In this case, you are recommended
    to use the launch scripts we provide:
    https://github.com/intel-analytics/analytics-zoo/tree/master/scripts.

    :param conf: An instance of SparkConf. If not specified, a new SparkConf with
           Analytics Zoo and BigDL configurations would be created and used.
           You can also input a string here to indicate the name of the application.
    :param cluster_mode: The mode for the Spark cluster. One of "local", "yarn-client",
       "yarn-cluster", "k8s-client", "standalone" and "spark-submit". Default to be "spark-submit".

       For "spark-submit", you are supposed to use spark-submit to submit the application.
       In this case, please set the Spark configurations through command line options or
       the properties file. You need to use "spark-submit" for k8s-cluster mode.
       To make things easier, you are recommended to use the launch scripts we provide:
       https://github.com/intel-analytics/analytics-zoo/tree/master/scripts.

       For other cluster modes, you are recommended to install and run analytics-zoo through
       pip, which is more convenient.
    :param spark_log_level: The log level for Spark. Default to be 'WARN'.
    :param redirect_spark_log: Whether to redirect the Spark log to local file. Default to be True.

    :return: An instance of SparkContext.
    """
    cluster_mode = cluster_mode.lower()
    memory = "2g"
    cores = 2
    num_nodes = 1

    spark_args = {}
    spark_args["spark_log_level"] = spark_log_level
    spark_args["redirect_spark_log"] = redirect_spark_log
    if conf and not isinstance(conf, six.string_types):
        memory = conf.get("spark.executor.memory", "2g")
        if conf.get("spark.executor.cores"):
            # SparkConf values are strings; cast before passing as executor cores/instances.
            cores = int(conf.get("spark.executor.cores"))
        if conf.get("spark.executor.instances"):
            num_nodes = int(conf.get("spark.executor.instances"))
        spark_args.update(conf.getAll())
    if cluster_mode == "spark-submit":
        sc = init_internal_nncontext(conf, spark_log_level, redirect_spark_log)
    elif cluster_mode == "local":
        if conf and not isinstance(conf, six.string_types):
            os.environ["SPARK_DRIVER_MEMORY"] = conf.get("spark.driver.memory", memory)
        else:
            os.environ["SPARK_DRIVER_MEMORY"] = memory

        python_location = None
        if "python_location" in kwargs:
            python_location = kwargs["python_location"]
        sc = init_spark_on_local(2, spark_args, python_location, spark_log_level,
                                 redirect_spark_log)
    elif cluster_mode in ("yarn-client", "yarn-cluster"):  # yarn-cluster or yarn-client
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        if not hadoop_conf:
            assert "hadoop_conf" in kwargs, \
                "Directory path to hadoop conf not found for yarn-client mode. Please either " \
                "specify argument hadoop_conf or set the environment variable HADOOP_CONF_DIR"
            hadoop_conf = kwargs["hadoop_conf"]
        from bigdl.dllib.utils.utils import detect_conda_env_name

        conda_env_name = detect_conda_env_name()
        for key in ["driver_cores", "driver_memory", "extra_executor_memory_for_ray",
                    "extra_python_lib", "penv_archive", "additional_archive",
                    "hadoop_user_name", "spark_yarn_archive", "jars"]:
            if key in kwargs:
                spark_args[key] = kwargs[key]
        if cluster_mode == "yarn-client":
            from bigdl.dllib.nncontext import init_spark_on_yarn
            sc = init_spark_on_yarn(hadoop_conf=hadoop_conf,
                                    conda_name=conda_env_name,
                                    num_executors=num_nodes, executor_cores=cores,
                                    executor_memory=memory, conf=spark_args)
        else:
            from bigdl.dllib.nncontext import init_spark_on_yarn_cluster
            sc = init_spark_on_yarn_cluster(hadoop_conf=hadoop_conf,
                                            conda_name=conda_env_name,
                                            num_executors=num_nodes,
                                            executor_cores=cores,
                                            executor_memory=memory,
                                            conf=spark_args)
    elif cluster_mode.startswith("k8s"):  # k8s or k8s-client
        if cluster_mode == "k8s-cluster":
            raise ValueError('For k8s-cluster mode, please set cluster_mode to "spark-submit" '
                             'and submit the application via spark-submit instead')
        assert "master" in kwargs, "Please specify master for k8s-client mode"
        assert "container_image" in kwargs, "Please specify container_image for k8s-client mode"
        for key in ["driver_cores", "driver_memory", "extra_executor_memory_for_ray",
                    "extra_python_lib", "jars", "python_location"]:
            if key in kwargs:
                spark_args[key] = kwargs[key]
        from bigdl.dllib.nncontext import init_spark_on_k8s

        sc = init_spark_on_k8s(master=kwargs["master"],
                               container_image=kwargs["container_image"],
                               num_executors=num_nodes, executor_cores=cores,
                               executor_memory=memory, **spark_args)
    elif cluster_mode == "standalone":
        for key in ["driver_cores", "driver_memory", "extra_executor_memory_for_ray",
                    "extra_python_lib", "jars", "master", "python_location", "enable_numa_binding"]:
            if key in kwargs:
                spark_args[key] = kwargs[key]
        from bigdl.dllib.nncontext import init_spark_standalone

        sc = init_spark_standalone(num_executors=num_nodes, executor_cores=cores,
                                   executor_memory=memory, **spark_args)
    else:
        raise ValueError("cluster_mode can only be local, yarn-client, yarn-cluster, standalone or spark-submit, "
                         "but got: %s".format(cluster_mode))
    return sc
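A minimal usage sketch for init_nncontext, assuming it is importable alongside the bigdl.dllib.nncontext helpers it calls; all configuration values below are illustrative:

from pyspark import SparkConf

# Local mode with an explicit SparkConf:
conf = SparkConf() \
    .set("spark.executor.memory", "4g") \
    .set("spark.executor.cores", "2") \
    .set("spark.executor.instances", "2")
sc = init_nncontext(conf=conf, cluster_mode="local")

# Or yarn-client mode; hadoop_conf may be omitted when HADOOP_CONF_DIR is set:
# sc = init_nncontext(cluster_mode="yarn-client", hadoop_conf="/etc/hadoop/conf")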
Example #3
        return x


if __name__ == '__main__':

    if len(sys.argv) != 2:
        print(sys.argv)
        print("Need parameters: <imagePath>")
        sys.exit(-1)

    hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')

    if hadoop_conf_dir:
        num_executors = 2
        num_cores_per_executor = 4
        zoo_conda_name = detect_conda_env_name()  # auto detect current conda env name
        sc = init_spark_on_yarn(
            hadoop_conf=hadoop_conf_dir,
            conda_name=zoo_conda_name,
            num_executors=num_executors,
            executor_cores=num_cores_per_executor,
            executor_memory="8g",
            driver_memory="2g",
            driver_cores=1)
    else:
        num_cores_per_executor = 4
        sc = init_spark_on_local(cores=num_cores_per_executor, conf={"spark.driver.memory": "10g"})

    model = CatDogModel()
    zoo_model = TorchModel.from_pytorch(model)
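The class definition of CatDogModel is cut off above (only the trailing `return x` of a forward method survives at the top of the excerpt). A hypothetical minimal definition consistent with it, assuming a plain PyTorch module for two-class cat/dog classification:

import torch.nn as nn

class CatDogModel(nn.Module):  # hypothetical stand-in for the truncated class
    def __init__(self):
        super(CatDogModel, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1))
        self.fc = nn.Linear(16, 2)  # two classes: cat and dog

    def forward(self, x):
        x = self.features(x)
        x = self.fc(x.flatten(1))
        return x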
Example #4
def init_orca_context(cluster_mode=None,
                      cores=2,
                      memory="2g",
                      num_nodes=1,
                      init_ray_on_spark=False,
                      **kwargs):
    """
    Creates or gets a SparkContext for different Spark cluster modes (and launch Ray services
    across the cluster if necessary).

    :param cluster_mode: The mode for the Spark cluster. One of "local", "yarn-client",
           "yarn-cluster", "k8s-client", "standalone" and "spark-submit". Default to be None;
           in this case an existing SparkContext in your application is used if one is active,
           otherwise local mode is used.
           
           For "yarn-client" and "yarn-cluster", you are supposed to use conda environment 
           and set the environment variable HADOOP_CONF_DIR.

           For "k8s-client", you are supposed to additionally specify the arguments master 
           and container_image.
           For "k8s-cluster", you are supposed to use spark-submit to submit the application 
           and use the default cluster_mode instead.
           In this case, please set the Spark configurations through command line options or
           the properties file. 
           To make things easier, you are recommended to use the launch scripts we provide:
           https://github.com/intel-analytics/BigDL/tree/branch-2.0/scripts.

           For other cluster modes, you are recommended to install and run bigdl through
           pip, which is more convenient.
    :param cores: The number of cores to be used on each node. Default to be 2.
    :param memory: The memory allocated for each node. Default to be '2g'.
    :param num_nodes: The number of nodes to be used in the cluster. Default to be 1.
           For Spark local, num_nodes should always be 1 and you don't need to change it.
    :param init_ray_on_spark: Whether to launch Ray services across the cluster.
           Default to be False and in this case the Ray cluster would be launched lazily when
           Ray is involved in Project Orca.
    :param kwargs: The extra keyword arguments used for creating SparkContext and
           launching Ray if any. 

    :return: An instance of SparkContext.
    """
    print("Initializing orca context")
    import atexit
    atexit.register(stop_orca_context)
    from pyspark import SparkContext
    import warnings
    spark_args = {}
    for key in ["conf", "spark_log_level", "redirect_spark_log"]:
        if key in kwargs:
            spark_args[key] = kwargs[key]
    if cluster_mode is not None:
        cluster_mode = cluster_mode.lower()
    active_sc = SparkContext._active_spark_context is not None
    if active_sc:
        if cluster_mode is not None and cluster_mode != "spark-submit":
            warnings.warn(
                "Use an existing SparkContext, " +
                "cluster_mode is determined by the existing SparkContext",
                Warning)
        from bigdl.dllib.nncontext import init_nncontext
        sc = init_nncontext(conf=None,
                            spark_log_level="WARN",
                            redirect_spark_log=True)
    else:
        cluster_mode = "local" if cluster_mode is None else cluster_mode
        if cluster_mode == "local":
            if num_nodes > 1:
                warnings.warn(
                    "For Spark local mode, num_nodes should be 1, but got " +
                    repr(num_nodes) + ", ignored", Warning)
            os.environ["SPARK_DRIVER_MEMORY"] = memory
            if "python_location" in kwargs:
                spark_args["python_location"] = kwargs["python_location"]
            from bigdl.dllib.nncontext import init_spark_on_local
            sc = init_spark_on_local(cores, **spark_args)
        elif cluster_mode == "spark-submit":
            from bigdl.dllib.nncontext import init_nncontext
            sc = init_nncontext(**spark_args)
        elif cluster_mode.startswith(
                "yarn"):  # yarn, yarn-client or yarn-cluster
            hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
            if not hadoop_conf:
                assert "hadoop_conf" in kwargs,\
                    "Directory path to hadoop conf not found for yarn-client mode. Please either " \
                    "specify argument hadoop_conf or set the environment variable HADOOP_CONF_DIR"
                hadoop_conf = kwargs["hadoop_conf"]
            from bigdl.dllib.utils.utils import detect_conda_env_name
            conda_env_name = detect_conda_env_name()
            for key in [
                    "driver_cores", "driver_memory",
                    "extra_executor_memory_for_ray", "extra_python_lib",
                    "penv_archive", "additional_archive", "hadoop_user_name",
                    "spark_yarn_archive", "jars"
            ]:
                if key in kwargs:
                    spark_args[key] = kwargs[key]
            from bigdl.dllib.nncontext import init_spark_on_yarn, init_spark_on_yarn_cluster
            if cluster_mode == "yarn-cluster":
                sc = init_spark_on_yarn_cluster(hadoop_conf=hadoop_conf,
                                                conda_name=conda_env_name,
                                                num_executors=num_nodes,
                                                executor_cores=cores,
                                                executor_memory=memory,
                                                **spark_args)
            else:
                sc = init_spark_on_yarn(hadoop_conf=hadoop_conf,
                                        conda_name=conda_env_name,
                                        num_executors=num_nodes,
                                        executor_cores=cores,
                                        executor_memory=memory,
                                        **spark_args)
        elif cluster_mode.startswith("k8s"):  # k8s or k8s-client
            if cluster_mode == "k8s-cluster":
                raise ValueError(
                    'For k8s-cluster mode, '
                    'please submit the application via spark-submit '
                    'and use the default cluster_mode instead')
            assert "master" in kwargs, "Please specify master for k8s-client mode"
            assert "container_image" in kwargs, "Please specify container_image for k8s-client mode"
            for key in [
                    "driver_cores", "driver_memory",
                    "extra_executor_memory_for_ray", "extra_python_lib",
                    "jars", "python_location"
            ]:
                if key in kwargs:
                    spark_args[key] = kwargs[key]
            from bigdl.dllib.nncontext import init_spark_on_k8s
            sc = init_spark_on_k8s(master=kwargs["master"],
                                   container_image=kwargs["container_image"],
                                   num_executors=num_nodes,
                                   executor_cores=cores,
                                   executor_memory=memory,
                                   **spark_args)
        elif cluster_mode == "standalone":
            for key in [
                    "driver_cores", "driver_memory",
                    "extra_executor_memory_for_ray", "extra_python_lib",
                    "jars", "master", "python_location", "enable_numa_binding"
            ]:
                if key in kwargs:
                    spark_args[key] = kwargs[key]
            from bigdl.dllib.nncontext import init_spark_standalone
            sc = init_spark_standalone(num_executors=num_nodes,
                                       executor_cores=cores,
                                       executor_memory=memory,
                                       **spark_args)
        else:
            raise ValueError(
                "cluster_mode can only be local, yarn-client, yarn-cluster, "
                "k8s-client, standalone or spark-submit, "
                "but got: {}".format(cluster_mode))
    ray_args = {}
    for key in [
            "redis_port", "password", "object_store_memory", "verbose", "env",
            "extra_params", "num_ray_nodes", "ray_node_cpu_cores",
            "include_webui"
    ]:
        if key in kwargs:
            ray_args[key] = kwargs[key]
    from bigdl.orca.ray import RayContext
    ray_ctx = RayContext(sc, **ray_args)
    if init_ray_on_spark:
        driver_cores = 0  # This is the default value.
        ray_ctx.init(driver_cores=driver_cores)
    return sc
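A minimal usage sketch for init_orca_context; values are illustrative, and the yarn-client call assumes a conda environment plus HADOOP_CONF_DIR (or the hadoop_conf argument), as required above:

sc = init_orca_context(cluster_mode="local", cores=4, memory="4g")

# Or on YARN, launching Ray services eagerly:
# sc = init_orca_context(cluster_mode="yarn-client", num_nodes=2, cores=4,
#                        memory="10g", init_ray_on_spark=True)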
Example #5
def main():
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR',
                        help='path to dataset')
    parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                        choices=model_names,
                        help='model architecture: ' +
                             ' | '.join(model_names) +
                             ' (default: resnet18)')
    parser.add_argument('--epochs', default=90, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--max_epochs', default=90, type=int, metavar='N',
                        help='number of max epochs to run')
    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument('-b', '--batch-size', default=256, type=int,
                        metavar='N',
                        help='mini-batch size (default: 256), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                        help='momentum')
    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('-p', '--print-freq', default=10, type=int,
                        metavar='N', help='print frequency (default: 10)')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                        help='evaluate model on validation set')
    parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                        help='use pre-trained model')
    parser.add_argument('--world-size', default=-1, type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--rank', default=-1, type=int,
                        help='node rank for distributed training')
    parser.add_argument('--seed', default=None, type=int,
                        help='seed for initializing training. ')
    parser.add_argument('--cores', default=4, type=int,
                        help='num of CPUs to use.')
    parser.add_argument('--nodes', default=1, type=int,
                        help='num of nodes to use.')
    parser.add_argument('--executor_memory', default='20g', type=str,
                        help='size of executor memory.')
    parser.add_argument('--driver_memory', default='20g', type=str,
                        help='size of driver memory.')
    parser.add_argument('--driver_cores', default=1, type=int,
                        help='num of driver cores to use.')
    args = parser.parse_args()
    hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
    if hadoop_conf_dir is None:
        sc = init_spark_on_local(cores=args.cores, conf={"spark.driver.memory": args.driver_memory})
    else:
        num_executors = args.nodes
        executor_memory = args.executor_memory
        driver_memory = args.driver_memory
        driver_cores = args.driver_cores
        num_cores_per_executor = args.cores
        os.environ['ZOO_MKL_NUMTHREADS'] = str(num_cores_per_executor)
        os.environ['OMP_NUM_THREADS'] = str(num_cores_per_executor)
        sc = init_spark_on_yarn(
            hadoop_conf=hadoop_conf_dir,
            conda_name=detect_conda_env_name(),  # auto detect current conda env name
            num_executors=num_executors,
            executor_cores=num_cores_per_executor,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            driver_cores=driver_cores,
            conf={"spark.rpc.message.maxSize": "1024",
                  "spark.task.maxFailures": "1",
                  "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"})

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True)

    model = torchvision.models.resnet50()
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False)

    # 1281167 is the number of images in the ImageNet (ILSVRC2012) training set.
    iterationPerEpoch = int(math.ceil(float(1281167) / args.batch_size))
    step = Step(iterationPerEpoch * 30, 0.1)  # decay the learning rate by 0.1x every 30 epochs
    zooOptimizer = SGD(args.lr, momentum=args.momentum, dampening=0.0,
                       leaningrate_schedule=step, weightdecay=args.weight_decay)
    zooModel = TorchModel.from_pytorch(model)
    criterion = torch.nn.CrossEntropyLoss()
    zooCriterion = TorchLoss.from_pytorch(criterion)
    estimator = Estimator(zooModel, optim_methods=zooOptimizer)
    train_featureSet = FeatureSet.pytorch_dataloader(train_loader)
    test_featureSet = FeatureSet.pytorch_dataloader(val_loader)
    estimator.train_minibatch(train_featureSet, zooCriterion, end_trigger=MaxEpoch(args.max_epochs),
                              checkpoint_trigger=EveryEpoch(), validation_set=test_featureSet,
                              validation_method=[Accuracy(), Top5Accuracy()])
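For reference, the Step schedule above decays the learning rate by a factor of 0.1 every 30 epochs, with the interval expressed in iterations. With the default batch size of 256:

import math

iterationPerEpoch = int(math.ceil(1281167 / 256))  # 5005 iterations per epoch
decay_interval = iterationPerEpoch * 30            # 150150 iterations = 30 epochs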