Beispiel #1
0
def execute(config, docker_registry, base_image, namespace=None):
    """
    Runs the LightGBM CLI in a single pod in user's Kubeflow cluster.
    Users can configure it to be a train, predict, and other supported tasks
    by using the right config.
    Please refer to https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
    for more information on config options.
    Attributes:
        config: LightGBM config, either a dict of parameters or a string path
                to a config file - Ref https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
        docker_registry: registry to push the built docker image
        base_image: base image to use for this job. It should have lightgbm installed and should be in PATH variable.
        namespace: Kubernetes namespace to use; defaults to "kubeflow" when None
    Raises:
        RuntimeError: if config is neither a dict nor a string.
    """
    if namespace is None:
        namespace = "kubeflow"

    # Normalise the two accepted config forms into (parsed config, file path).
    if isinstance(config, str):
        config_file_name = config
        config = _load_config_file(config)
    elif isinstance(config, dict):
        config_file_name = _save_to_config_file(config)
    else:
        # Report the type of the offending argument, not of the `dict` builtin.
        raise RuntimeError("config should be of type dict or string(filepath) "
                           "but got {}".format(type(config)))

    output_map = generate_context_files(config, config_file_name)

    preprocessor = BasePreProcessor(
        command=[ENTRYPOINT], output_map=output_map)
    builder = AppendBuilder(registry=docker_registry,
                            base_image=base_image, preprocessor=preprocessor)
    builder.build()
    pod_spec = builder.generate_pod_spec()
    deployer = Job(namespace=namespace, pod_spec_mutators=[
                   fairing.cloud.gcp.add_gcp_credentials_if_exists])
    deployer.deploy(pod_spec)
Beispiel #2
0
def execute(config,
            docker_registry,
            base_image="gcr.io/kubeflow-fairing/lightgbm:latest",
            namespace="kubeflow",
            stream_log=True,
            cores_per_worker=None,
            memory_per_worker=None,
            pod_spec_mutators=None):
    """
    Runs the LightGBM CLI in user's Kubeflow cluster, as a single Job when
    num_machines is 1 and as a TfJob (1 chief + num_machines - 1 workers)
    otherwise.
    Users can configure it to be a train, predict, and other supported tasks
    by using the right config.
    Please refer to https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
    for more information on config options.
    Attributes:
        config: LightGBM config, either a dict of parameters or a string path
                to a config file - Ref https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
        docker_registry: registry to push the built docker image
        base_image: base image to use for this job. It should have lightgbm installed and should be in PATH variable.
        namespace: Kubernetes namespace to use
        stream_log: True - streams logs from the first worker in the training job after job launch till the training is finished.
                    False - no logs are streamed after the job launch. An async job launch use case.
        cores_per_worker: #cpu cores allocated per worker
        memory_per_worker: memory allocated per worker in GB, it can be fractional.
        pod_spec_mutators: list of functions that is used to mutate the podsspec. e.g. fairing.cloud.gcp.add_gcp_credentials_if_exists
                           This can used to set things like volumes and security context.
                           The list passed in is not modified.
    Returns:
        The deployer (Job or TfJob) after deploy() has been called on it.
    Raises:
        RuntimeError: if config is neither a dict nor a string.
        ValueError: if the configured number of machines is not an int >= 1.
    """

    # Normalise the two accepted config forms into (parsed config, file path).
    if isinstance(config, str):
        config_file_name = config
        config = utils.load_properties_config_file(config)
    elif isinstance(config, dict):
        config_file_name = utils.save_properties_config_file(config)
    else:
        # Report the type of the offending argument, not of the `dict` builtin.
        raise RuntimeError("config should be of type dict or string(filepath) "
                           "but got {}".format(type(config)))

    utils.scrub_fields(config, BLACKLISTED_FIELDS)

    _, num_machines = utils.get_config_value(config, NUM_MACHINES_FILEDS)
    # A missing/empty value means non-distributed mode.
    num_machines = num_machines or 1
    try:
        num_machines = int(num_machines)
    except ValueError as err:
        # num_machines still holds the raw value here; the value may have been
        # supplied under an alias key, so do not re-read 'num_machines'.
        raise ValueError(
            "num_machines value in config should be an int >= 1 "
            "but got {}".format(num_machines)) from err
    if num_machines < 1:
        raise ValueError(
            "num_machines value in config should >= 1 but got {}".format(
                num_machines))

    distributed = num_machines > 1
    if distributed:
        config['machine_list_file'] = "mlist.txt"
    output_map = generate_context_files(config, config_file_name, distributed)

    preprocessor = BasePreProcessor(command=[ENTRYPOINT],
                                    output_map=output_map)
    builder = AppendBuilder(registry=docker_registry,
                            base_image=base_image,
                            preprocessor=preprocessor)
    builder.build()
    pod_spec = builder.generate_pod_spec()

    # Copy so the caller's list is not grown on every call.
    pod_spec_mutators = list(pod_spec_mutators or [])
    pod_spec_mutators.append(fairing.cloud.gcp.add_gcp_credentials_if_exists)
    pod_spec_mutators.append(
        k8s_utils.get_resource_mutator(cores_per_worker, memory_per_worker))

    if not distributed:
        # non-distributed mode
        deployer = Job(namespace=namespace,
                       pod_spec_mutators=pod_spec_mutators,
                       stream_log=stream_log)
    else:
        # distributed mode
        deployer = TfJob(namespace=namespace,
                         pod_spec_mutators=pod_spec_mutators,
                         chief_count=1,
                         worker_count=num_machines - 1,
                         stream_log=stream_log)
    deployer.deploy(pod_spec)
    return deployer
Beispiel #3
0
def execute(config,
            docker_registry,
            base_image="gcr.io/kubeflow-fairing/lightgbm:latest",
            namespace="kubeflow"):
    """
    Runs the LightGBM CLI in user's Kubeflow cluster, as a single Job when
    num_machines is 1 and as a TfJob (1 chief + num_machines - 1 workers)
    otherwise.
    Users can configure it to be a train, predict, and other supported tasks
    by using the right config.
    Please refer to https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
    for more information on config options.
    Attributes:
        config: LightGBM config, either a dict of parameters or a string path
                to a config file - Ref https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
        docker_registry: registry to push the built docker image
        base_image: base image to use for this job. It should have lightgbm installed and should be in PATH variable.
        namespace: Kubernetes namespace to use
    Raises:
        RuntimeError: if config is neither a dict nor a string.
        ValueError: if the configured number of machines is not an int >= 1.
    """

    # Normalise the two accepted config forms into (parsed config, file path).
    if isinstance(config, str):
        config_file_name = config
        config = utils.load_properties_config_file(config)
    elif isinstance(config, dict):
        config_file_name = utils.save_properties_config_file(config)
    else:
        # Report the type of the offending argument, not of the `dict` builtin.
        raise RuntimeError("config should be of type dict or string(filepath) "
                           "but got {}".format(type(config)))

    utils.scrub_fields(config, BLACKLISTED_FIELDS)

    _, num_machines = utils.get_config_value(config, NUM_MACHINES_FILEDS)
    # A missing/empty value means non-distributed mode.
    num_machines = num_machines or 1
    try:
        num_machines = int(num_machines)
    except ValueError as err:
        # num_machines still holds the raw value here; the value may have been
        # supplied under an alias key, so do not re-read 'num_machines'.
        raise ValueError(
            "num_machines value in config should be an int >= 1 "
            "but got {}".format(num_machines)) from err
    if num_machines < 1:
        raise ValueError(
            "num_machines value in config should >= 1 but got {}".format(
                num_machines))

    distributed = num_machines > 1
    if distributed:
        config['machine_list_file'] = "mlist.txt"
    output_map = generate_context_files(config, config_file_name, distributed)

    preprocessor = BasePreProcessor(command=[ENTRYPOINT],
                                    output_map=output_map)
    builder = AppendBuilder(registry=docker_registry,
                            base_image=base_image,
                            preprocessor=preprocessor)
    builder.build()
    pod_spec = builder.generate_pod_spec()

    mutators = [fairing.cloud.gcp.add_gcp_credentials_if_exists]
    if not distributed:
        # non-distributed mode
        deployer = Job(namespace=namespace, pod_spec_mutators=mutators)
    else:
        # distributed mode
        deployer = TfJob(namespace=namespace,
                         pod_spec_mutators=mutators,
                         chief_count=1,
                         worker_count=num_machines - 1)
    deployer.deploy(pod_spec)