Ejemplo n.º 1
0
def test_raylet_tempfiles():
    """Verify the temp-directory layout created by ``ray.init``.

    Ray is started and shut down three times: without worker-output
    redirection, with redirection and ``num_cpus=0``, and with redirection
    and ``num_cpus=2``. After each start the entries of the temp root, the
    logs directory and the sockets directory are compared with the expected
    names.
    """
    expected_top_levels = {"ray_ui.ipynb", "sockets", "logs"}
    expected_sockets = {"plasma_store", "raylet"}
    # Log files of the non-worker processes, raylet logs included.
    expected_service_logs = {
        "log_monitor.out", "log_monitor.err",
        "plasma_store.out", "plasma_store.err",
        "webui.out", "webui.err",
        "monitor.out", "monitor.err",
        "raylet_monitor.out", "raylet_monitor.err",
        "redis-shard_0.out", "redis-shard_0.err",
        "redis.out", "redis.err",
        "raylet.out", "raylet.err",
    }

    def entries(directory):
        """Return the names found directly inside ``directory`` as a set."""
        return set(os.listdir(directory))

    # In the first two configurations the logs directory must hold exactly
    # the service logs and nothing else.
    exact_match_configs = (
        {"redirect_worker_output": False},
        {"redirect_worker_output": True, "num_cpus": 0},
    )
    for init_kwargs in exact_match_configs:
        ray.init(**init_kwargs)
        top_levels = entries(tempfile_services.get_temp_root())
        assert top_levels == expected_top_levels
        log_files = entries(tempfile_services.get_logs_dir_path())
        assert log_files == expected_service_logs
        socket_files = entries(tempfile_services.get_sockets_dir_path())
        assert socket_files == expected_sockets
        ray.shutdown()

    # With redirection and two CPUs, additional worker log files may appear,
    # so the service logs only need to be a subset of what is on disk.
    ray.init(redirect_worker_output=True, num_cpus=2)
    top_levels = entries(tempfile_services.get_temp_root())
    assert top_levels == expected_top_levels
    time.sleep(3)  # Give the workers time to start.
    log_files = entries(tempfile_services.get_logs_dir_path())
    assert log_files >= expected_service_logs

    # Exactly four log files must carry the "worker" prefix.
    worker_logs = [name for name in log_files if name.startswith("worker")]
    assert len(worker_logs) == 4

    socket_files = entries(tempfile_services.get_sockets_dir_path())
    assert socket_files == expected_sockets
    ray.shutdown()
Ejemplo n.º 2
0
def test_raylet_tempfiles():
    """Check which temp files exist after ``ray.init`` in three setups.

    The setups are: no worker-output redirection (no raylet logs expected),
    redirection with ``num_cpus=0`` (raylet logs expected), and redirection
    with ``num_cpus=2`` (raylet logs plus worker logs expected). Each setup
    ends with ``ray.shutdown()``.
    """
    top_level_names = {"ray_ui.ipynb", "sockets", "logs"}
    socket_names = {"plasma_store", "raylet"}
    # Logs expected when worker output is not redirected (no raylet logs).
    base_logs = {
        "log_monitor.out", "log_monitor.err",
        "plasma_store.out", "plasma_store.err",
        "webui.out", "webui.err",
        "monitor.out", "monitor.err",
        "redis-shard_0.out", "redis-shard_0.err",
        "redis.out", "redis.err",
    }
    # Logs expected once the raylet's output is captured as well.
    logs_with_raylet = base_logs | {"raylet.out", "raylet.err"}

    def listing(path):
        """Return the directory entries of ``path`` as a set of names."""
        return set(os.listdir(path))

    # Setup 1: no redirection -> the raylet log files must be absent.
    ray.init(redirect_worker_output=False)
    assert listing(tempfile_services.get_temp_root()) == top_level_names
    assert listing(tempfile_services.get_logs_dir_path()) == base_logs
    assert listing(tempfile_services.get_sockets_dir_path()) == socket_names
    ray.shutdown()

    # Setup 2: redirection with zero CPUs -> raylet logs, no extra files.
    ray.init(redirect_worker_output=True, num_cpus=0)
    assert listing(tempfile_services.get_temp_root()) == top_level_names
    assert listing(tempfile_services.get_logs_dir_path()) == logs_with_raylet
    assert listing(tempfile_services.get_sockets_dir_path()) == socket_names
    ray.shutdown()

    # Setup 3: redirection with two CPUs -> raylet logs plus worker logs.
    ray.init(redirect_worker_output=True, num_cpus=2)
    assert listing(tempfile_services.get_temp_root()) == top_level_names
    time.sleep(3)  # Give the workers time to start.
    current_logs = listing(tempfile_services.get_logs_dir_path())
    assert current_logs >= logs_with_raylet

    # Exactly four log files must carry the "worker" prefix.
    worker_log_count = len(
        [name for name in current_logs if name.startswith("worker")])
    assert worker_log_count == 4

    assert listing(tempfile_services.get_sockets_dir_path()) == socket_names
    ray.shutdown()
Ejemplo n.º 3
0
def start_worker(node_ip_address,
                 object_store_name,
                 local_scheduler_name,
                 redis_address,
                 worker_path,
                 stdout_file=None,
                 stderr_file=None):
    """Launch a worker as a child Python process.

    The worker script is executed with the current interpreter and the
    ``-u`` flag, and the given stdout/stderr handles are then passed to
    ``record_log_files_in_redis``.

    Args:
        node_ip_address (str): The IP address of the node that this worker is
            running on.
        object_store_name (str): The name of the object store.
        local_scheduler_name (str): The name of the local scheduler. Note
            that this value is not forwarded on the command line built here.
        redis_address (str): The address that the Redis server is listening
            on. It is converted with ``str`` before being passed on.
        worker_path (str): The path of the source code which the worker
            process will run.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.

    Returns:
        The ``subprocess.Popen`` object of the started worker.
    """
    worker_flags = [
        "--node-ip-address=" + node_ip_address,
        "--object-store-name=" + object_store_name,
        "--redis-address=" + str(redis_address),
        "--temp-dir=" + get_temp_root(),
    ]
    worker_process = subprocess.Popen(
        [sys.executable, "-u", worker_path] + worker_flags,
        stdout=stdout_file,
        stderr=stderr_file)
    log_handles = [stdout_file, stderr_file]
    record_log_files_in_redis(redis_address, node_ip_address, log_handles)
    return worker_process
Ejemplo n.º 4
0
def start_raylet(redis_address,
                 node_ip_address,
                 raylet_name,
                 plasma_store_name,
                 worker_path,
                 num_cpus=None,
                 num_gpus=None,
                 resources=None,
                 object_manager_port=None,
                 node_manager_port=None,
                 redis_password=None,
                 use_valgrind=False,
                 use_profiler=False,
                 stdout_file=None,
                 stderr_file=None,
                 config=None):
    """Start a raylet, which is a combined local scheduler and object manager.

    The raylet executable is launched directly, under valgrind, under
    callgrind, or (when the ``RAYLET_PERFTOOLS_PATH`` environment variable is
    set and neither valgrind nor the profiler was requested) with
    ``LD_PRELOAD``/``CPUPROFILE`` set from ``RAYLET_PERFTOOLS_PATH`` and
    ``RAYLET_PERFTOOLS_LOGFILE``. Afterwards the stdout/stderr handles are
    passed to ``record_log_files_in_redis``.

    Args:
        redis_address (str): The address of the primary Redis server, in
            ``host:port`` form.
        node_ip_address (str): The IP address of this node.
        raylet_name (str): The name of the raylet socket to create.
        plasma_store_name (str): The name of the plasma store socket to
            connect to.
        worker_path (str): The path of the Python file that new worker
            processes will execute.
        num_cpus: The CPUs allocated for this raylet. If None, the number of
            initial workers falls back to ``multiprocessing.cpu_count()``.
        num_gpus: The GPUs allocated for this raylet.
        resources: The custom resources allocated for this raylet.
        object_manager_port: The port to use for the object manager. If this
            is None, then the object manager will choose its own port.
        node_manager_port: The port to use for the node manager. If this is
            None, then the node manager will choose its own port.
        redis_password: The password to use when connecting to Redis.
        use_valgrind (bool): True if the raylet should be started inside
            of valgrind. If this is True, use_profiler must be False.
        use_profiler (bool): True if the raylet should be started inside
            a profiler. If this is True, use_valgrind must be False.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.
        config (dict|None): Optional Raylet configuration that will
            override defaults in RayConfig.

    Returns:
        The process that was started.

    Raises:
        Exception: If both ``use_valgrind`` and ``use_profiler`` are set.
        KeyError: If ``RAYLET_PERFTOOLS_PATH`` is used but
            ``RAYLET_PERFTOOLS_LOGFILE`` is not set in the environment.
    """
    config = config or {}
    # Flatten the config overrides into 'key1,value1,key2,value2'.
    config_str = ",".join(
        "{},{}".format(key, value) for key, value in config.items())

    if use_valgrind and use_profiler:
        raise Exception("Cannot use valgrind and profiler at the same time.")

    if num_cpus is None:
        num_initial_workers = multiprocessing.cpu_count()
    else:
        num_initial_workers = num_cpus

    static_resources = check_and_update_resources(num_cpus, num_gpus,
                                                  resources)

    # Cap the number of workers the raylet may start in parallel, but never
    # let the cap drop below 1.
    maximum_startup_concurrency = max(
        1, min(multiprocessing.cpu_count(), static_resources["CPU"]))

    # Render the resources in a form like 'CPU,1.0,GPU,0,Custom,3'.
    resource_argument = ",".join(
        "{},{}".format(name, quantity)
        for name, quantity in static_resources.items())

    gcs_ip_address, gcs_port = redis_address.split(":")

    # Build the command line that the raylet will use to start workers.
    worker_command_template = ("{} {} "
                               "--node-ip-address={} "
                               "--object-store-name={} "
                               "--raylet-name={} "
                               "--redis-address={} "
                               "--temp-dir={}")
    start_worker_command = worker_command_template.format(
        sys.executable, worker_path, node_ip_address, plasma_store_name,
        raylet_name, redis_address, get_temp_root())
    if redis_password:
        start_worker_command += " --redis-password {}".format(redis_password)

    # A port of 0 lets the object manager / node manager pick its own port.
    object_manager_port = (0 if object_manager_port is None else
                           object_manager_port)
    node_manager_port = 0 if node_manager_port is None else node_manager_port

    command = [
        RAYLET_EXECUTABLE,
        raylet_name,
        plasma_store_name,
        str(object_manager_port),
        str(node_manager_port),
        node_ip_address,
        gcs_ip_address,
        gcs_port,
        str(num_initial_workers),
        str(maximum_startup_concurrency),
        resource_argument,
        config_str,
        start_worker_command,
        "",  # Worker command for Java, not needed for Python.
        redis_password or "",
        get_temp_root(),
    ]

    # Pick the wrapper to run the raylet under and, for the perftools case,
    # a modified environment. ``env=None`` makes Popen inherit the current
    # environment, which matches not passing ``env`` at all.
    launcher_prefix = []
    process_env = None
    if use_valgrind:
        launcher_prefix = [
            "valgrind", "--track-origins=yes", "--leak-check=full",
            "--show-leak-kinds=all", "--leak-check-heuristics=stdstring",
            "--error-exitcode=1"
        ]
    elif use_profiler:
        launcher_prefix = ["valgrind", "--tool=callgrind"]
    elif "RAYLET_PERFTOOLS_PATH" in os.environ:
        process_env = os.environ.copy()
        process_env["LD_PRELOAD"] = os.environ["RAYLET_PERFTOOLS_PATH"]
        process_env["CPUPROFILE"] = os.environ["RAYLET_PERFTOOLS_LOGFILE"]

    p = subprocess.Popen(launcher_prefix + command,
                         stdout=stdout_file,
                         stderr=stderr_file,
                         env=process_env)

    record_log_files_in_redis(redis_address,
                              node_ip_address, [stdout_file, stderr_file],
                              password=redis_password)

    return p
Ejemplo n.º 5
0
def start_local_scheduler(plasma_store_name,
                          plasma_manager_name=None,
                          worker_path=None,
                          plasma_address=None,
                          node_ip_address="127.0.0.1",
                          redis_address=None,
                          use_valgrind=False,
                          use_profiler=False,
                          stdout_file=None,
                          stderr_file=None,
                          static_resources=None,
                          num_workers=0):
    """Start a local scheduler process.

    The local scheduler executable is located relative to this file, its
    command line is assembled from the arguments, and it is launched
    directly, under valgrind, or under callgrind. The function then sleeps
    briefly (1.0 s when wrapped by valgrind/callgrind, 0.1 s otherwise)
    before returning.

    Args:
        plasma_store_name (str): The name of the plasma store socket to
            connect to.
        plasma_manager_name (str): The name of the plasma manager to connect
            to. This does not need to be provided, but if it is, then the
            Redis address must be provided as well.
        worker_path (str): The path of the worker script to use when the
            local scheduler starts up new workers. If given, the plasma
            manager name and Redis address must be given too.
        plasma_address (str): The address of the plasma manager to connect
            to. This is only used by the global scheduler to figure out which
            plasma managers are connected to which local schedulers.
        node_ip_address (str): The address of the node that this local
            scheduler is running on.
        redis_address (str): The address of the Redis instance to connect to.
            If this is not provided, then the local scheduler will not
            connect to Redis.
        use_valgrind (bool): True if the local scheduler should be started
            inside of valgrind. If this is True, use_profiler must be False.
        use_profiler (bool): True if the local scheduler should be started
            inside a profiler. If this is True, use_valgrind must be False.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.
        static_resources: A dictionary specifying the local scheduler's
            resource capacities. This maps resource names (strings) to
            integers or floats. If None, a single ``CPU`` resource equal to
            ``multiprocessing.cpu_count()`` is used.
        num_workers (int): The number of workers that the local scheduler
            should start.

    Returns:
        A tuple of the name of the local scheduler socket and the
            ``subprocess.Popen`` object of the local scheduler process.

    Raises:
        Exception: If exactly one of ``plasma_manager_name`` and
            ``redis_address`` is provided, or if both ``use_valgrind`` and
            ``use_profiler`` are set.
    """
    manager_given = plasma_manager_name is not None
    redis_given = redis_address is not None
    if manager_given != redis_given:
        raise Exception("If one of the plasma_manager_name and the "
                        "redis_address is provided, then both must be "
                        "provided.")
    if use_valgrind and use_profiler:
        raise Exception("Cannot use valgrind and profiler at the same time.")

    this_dir = os.path.dirname(os.path.abspath(__file__))
    local_scheduler_executable = os.path.join(
        this_dir, "../core/src/local_scheduler/local_scheduler")
    local_scheduler_name = get_local_scheduler_socket_name()

    command = [
        local_scheduler_executable,
        "-s", local_scheduler_name,
        "-p", plasma_store_name,
        "-h", node_ip_address,
        "-n", str(num_workers),
    ]
    if manager_given:
        command.extend(["-m", plasma_manager_name])
    if worker_path is not None:
        assert plasma_store_name is not None
        assert plasma_manager_name is not None
        assert redis_address is not None
        # Command line the local scheduler uses to start new workers.
        worker_command_template = ("{} {} "
                                   "--node-ip-address={} "
                                   "--object-store-name={} "
                                   "--object-store-manager-name={} "
                                   "--local-scheduler-name={} "
                                   "--redis-address={} "
                                   "--temp-dir={}")
        start_worker_command = worker_command_template.format(
            sys.executable, worker_path, node_ip_address, plasma_store_name,
            plasma_manager_name, local_scheduler_name, redis_address,
            get_temp_root())
        command.extend(["-w", start_worker_command])
    if redis_given:
        command.extend(["-r", redis_address])
    if plasma_address is not None:
        command.extend(["-a", plasma_address])

    if static_resources is None:
        resource_argument = "CPU,{}".format(multiprocessing.cpu_count())
    else:
        # Every quantity must be numeric before anything is rendered.
        for quantity in static_resources.values():
            assert isinstance(quantity, (int, float))
        resource_argument = ",".join(
            name + "," + str(quantity)
            for name, quantity in static_resources.items())
    command.extend(["-c", resource_argument])

    # Choose the wrapper to run under and how long to wait after launching.
    if use_valgrind:
        launcher_prefix = [
            "valgrind", "--track-origins=yes", "--leak-check=full",
            "--show-leak-kinds=all", "--leak-check-heuristics=stdstring",
            "--error-exitcode=1"
        ]
        startup_delay = 1.0
    elif use_profiler:
        launcher_prefix = ["valgrind", "--tool=callgrind"]
        startup_delay = 1.0
    else:
        launcher_prefix = []
        startup_delay = 0.1

    pid = subprocess.Popen(launcher_prefix + command,
                           stdout=stdout_file,
                           stderr=stderr_file)
    time.sleep(startup_delay)
    return local_scheduler_name, pid