Example #1
import os
import time

import cdsw


def fit_models_parallel():
    '''
    Use the CDSW Workers API (via Python SDK) to launch each model fitting script in parallel

    Docs - https://docs.cloudera.com/machine-learning/cloud/distributed-computing/topics/ml-workers-api.html

    '''
    # Launch a separate worker to run each script independently

    base_path = os.getcwd()
    script_path = base_path + '/scripts'

    scripts = os.listdir(script_path)
    scripts = [
        script_path + '/' + script for script in scripts
        if script[0:3] in ['fit', 'mak']
    ]

    for script in scripts:
        cdsw.launch_workers(n=1, cpu=1, memory=3, script=script)

    # Force session to persist until each worker job has completed
    # Check for completion every minute

    complete = False

    while not complete:

        time.sleep(60)

        workers = cdsw.list_workers()
        workers_status = [wkr['status'] for wkr in workers]

        if all(status == 'succeeded' for status in workers_status):
            complete = True
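
A minimal usage sketch for the function above; the scripts/ layout shown is an assumption based on the prefix filter (file names beginning with 'fit' or 'mak').

# Hypothetical project layout assumed by fit_models_parallel():
#   scripts/fit_model_linear.py
#   scripts/fit_model_gbm.py
#   scripts/make_features.py
# Each script is self-contained and exits once its work is done.
fit_models_parallel()  # blocks until every launched worker reports 'succeeded'
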
Example #2
def run_dask_workers(n,
                     cpu,
                     memory,
                     nvidia_gpu=0,
                     scheduler_port=default_scheduler_port):
    """
  Run a CDSW worker, and run a Dask worker inside it.
  Assumes that the scheduler is running on the CDSW master.
  """
    worker_code = """
import cdsw_dask_utils
worker_proc = cdsw_dask_utils._run_dask_worker_in_worker(scheduler_port=%d)
# Keep the CDSW worker alive until the Dask worker exits.
print(worker_proc.wait())
""" % scheduler_port
    workers = cdsw.launch_workers(n=n,
                                  cpu=cpu,
                                  memory=memory,
                                  nvidia_gpu=nvidia_gpu,
                                  kernel="python3",
                                  code=worker_code)
    ids = [worker['id'] for worker in workers]
    print("IDs", ids)
    # Wait for the workers to start running, but don't wait for them to exit -
    # we want them to stay up for use as daemons.
    cdsw_await_workers.await_workers(ids, wait_for_completion=False)
    return workers
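
A hedged usage sketch, assuming a Dask scheduler is already listening on default_scheduler_port in this (master) session before the workers are launched; the scheduler address is built from CDSW_IP_ADDRESS, the same variable Example #7 uses.

import os
from dask.distributed import Client

# Launch two CDSW workers, each running a Dask worker as a daemon.
workers = run_dask_workers(n=2, cpu=1, memory=2)

# Connect a Dask client to the scheduler running alongside this session.
scheduler_addr = "tcp://%s:%d" % (os.environ["CDSW_IP_ADDRESS"], default_scheduler_port)
client = Client(scheduler_addr)
print(client)
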
Example #3
def run_dask_workers(n,
                     cpu,
                     memory,
                     nvidia_gpu=0,
                     scheduler_port=default_scheduler_port):
    """
  Run a CDSW worker, and run a Dask worker inside it.
  Assumes that the scheduler is running on the CDSW master.
  """
    worker_code = """
import cdsw_dask_utils
worker_proc = cdsw_dask_utils._run_dask_worker_in_worker(scheduler_port=%d)
# Keep the CDSW worker alive until the Dask worker exits.
print(worker_proc.wait())
""" % scheduler_port
    workers = cdsw.launch_workers(n=n,
                                  cpu=cpu,
                                  memory=memory,
                                  nvidia_gpu=nvidia_gpu,
                                  kernel="python3",
                                  code=worker_code)

    try:
        ids = [worker['id'] for worker in workers]

    except KeyError:
        # A worker dict without an 'id' key means the launch failed; report the
        # Kubernetes message for each worker before raising.
        errors = [[worker['k8sMessage'], worker['engineId']]
                  for worker in workers]
        for error in errors:
            print('worker {} failed to launch with err message: {}'.format(
                error[1], error[0]))
        raise RuntimeError("failed to launch workers with err : " + error[0])

    print("IDs", ids)
    # Wait for the workers to start running, but don't wait for them to exit -
    # we want them to stay up for use as daemons.
    cdsw_await_workers.await_workers(ids, wait_for_completion=False)
    return workers
Example #4
import json
import time

import cdsw

# Code passed via launch_workers(code=...) runs as a standalone snippet inside
# each worker, so it should not carry leading indentation.
worker_code = '''
import os

engine_id = os.environ.get('CDSW_ENGINE_ID')
print('executing a whole bunch of code inside worker: {}'.format(engine_id))
'''

workers = cdsw.launch_workers(n=2, 
                              cpu=1, 
                              memory=1, 
                              code=worker_code) 

# # Get worker IDs
for worker in workers:
  print(worker['id'])


# ### get workers information
# wait 10 secs for workers to come up
time.sleep(10)

for worker in workers:
  print(json.dumps(worker, indent=4))
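
A hedged follow-up sketch: since these workers exit right after printing, the session can wait for them and then clean up, using cdsw.await_workers and cdsw.stop_workers as seen in the later examples; the timeout value here is an arbitrary assumption.

ids = [worker['id'] for worker in workers]

# Wait for both workers to run to completion.
cdsw.await_workers(ids, wait_for_completion=True, timeout_seconds=120)

# Defensive cleanup in case anything is still running.
cdsw.stop_workers(*ids)
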
Example #5
# master.py

import cdsw, socket

# Launch one CDSW worker. Workers are engines that run in
# the same project, execute a given code or script, and exit.
workers = cdsw.launch_workers(n=1, cpu=2, memory=4, kernel="python3", script="worker.py")

# Listen on TCP port 6001
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("0.0.0.0", 6001))
s.listen(1)

# Accept a connection from the worker, which will execute worker.py.
conn, addr = s.accept()
for i in range(1):
    # Receive a message from each worker and return a response.
    data = conn.recv(20)
    if not data: break
    print("Master received:", data)
    conn.send("Hello From Server!".encode())
conn.close()
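
worker.py itself isn't shown here; a minimal sketch of what it might contain, assuming the worker can reach the master session through the CDSW_MASTER_IP environment variable and connects to the same port 6001 that master.py binds.

# worker.py (hypothetical counterpart to master.py above)
import os, socket

# Workers receive the master session's IP via CDSW_MASTER_IP.
master_ip = os.environ["CDSW_MASTER_IP"]

s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((master_ip, 6001))
s.send("Hello From Worker!".encode())
data = s.recv(20)
print("Worker received:", data)
s.close()
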
Example #6
def run_cluster(n_workers,
                n_ps,
                cpu,
                memory,
                nvidia_gpu=0,
                worker_script=None,
                timeout_seconds=60):
    try:
        os.mkdir("/home/cdsw/.tmp", mode=0o755)
    except FileExistsError:
        pass
    fname = tempfile.mkdtemp(prefix="/home/cdsw/.tmp/clusterspec")

    worker_code = tensorflow_worker_code(fname, "worker", worker_script)
    workers = cdsw.launch_workers(n_workers,
                                  cpu=cpu,
                                  memory=memory,
                                  nvidia_gpu=nvidia_gpu,
                                  code=worker_code)
    worker_ids = [worker["id"] for worker in workers]
    if n_ps > 0:
        ps_code = tensorflow_worker_code(fname, "ps", None)
        parameter_servers = cdsw.launch_workers(n_ps,
                                                cpu=cpu,
                                                memory=memory,
                                                code=ps_code)
        ps_ids = [ps["id"] for ps in parameter_servers]
    else:
        parameter_servers = []
        ps_ids = []

    # Get the IP addresses of the workers. First, wait for them all to run
    running_workers = cdsw.await_workers(worker_ids,
                                         wait_for_completion=False,
                                         timeout_seconds=timeout_seconds)
    if running_workers["failures"]:
        raise RuntimeError("Some workers failed to run")

    # Then extract the IP's from the dictionary describing them.
    worker_ips = [
        worker["ip_address"] for worker in running_workers["workers"]
    ]

    # Get the IP addresses of the parameter servers, if any
    ps_ips = []
    if n_ps > 0:
        running_ps = cdsw.await_workers(ps_ids,
                                        wait_for_completion=False,
                                        timeout_seconds=timeout_seconds)
        if running_ps["failures"]:
            raise RuntimeError("Some parameter servers failed to run")

        ps_ips = [ps["ip_address"] for ps in running_ps["workers"]]

    cspec = {
        "worker": [ip + (":%d" % tf_port) for ip in worker_ips],
        "ps": [ip + (":%d" % tf_port) for ip in ps_ips]
    }
    tmpf = fname + "/cluster.json.tmp"
    f = open(tmpf, 'w')
    f.write(json.dumps(cspec))
    f.flush()
    os.fsync(f.fileno())
    f.close()
    os.rename(tmpf, fname + "/cluster.json")

    if worker_script is not None:
        # If a script has been provided for the Tensorflow workers,
        # wait for them all to exit.
        cdsw.await_workers(worker_ids, wait_for_completion=True)
        cdsw.stop_workers(*ps_ids)
        return None, None
    else:
        # If no script has been provided, wait for the TensorFlow
        # cluster to come up, then return a handle to the lead worker
        # so the user can create a TensorFlow session.

        # Wait for workers to be up
        for ip in worker_ips:
            wait.tcp.open(tf_port, host=ip)

        for ip in ps_ips:
            wait.tcp.open(tf_port, host=ip)

        return cspec, "grpc://%s:%d" % (worker_ips[0], tf_port)
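
A hedged usage sketch for the interactive path (worker_script=None), assuming a TensorFlow 1.x-style session as the grpc:// target implies; the worker/parameter-server counts and resources are placeholders.

import tensorflow as tf

# Bring up two workers and one parameter server, then attach a session to the
# lead worker address that run_cluster() returns.
cluster_spec, session_addr = run_cluster(n_workers=2,
                                         n_ps=1,
                                         cpu=1,
                                         memory=2)

sess = tf.compat.v1.Session(session_addr)
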
Example #7
import os
import time

import cdsw

schedulerip = os.environ["CDSW_IP_ADDRESS"]

print(" Scheduler IP: " + schedulerip)

# Scheduler protocol and port - defaults from Dask
schproto = "tcp://"
schport = ":8786"

schloc = schproto + schedulerip + schport
print(" Scheduler URL: " + schloc)

# Launch the first Dask worker

dask_worker_1 = cdsw.launch_workers(n=1,
                                    cpu=4,
                                    memory=8,
                                    kernel="python3",
                                    script="daskworker.py",
                                    env={"DASKSCHURL": schloc})

time.sleep(10)

# Launch a second Dask worker

dask_worker_2 = cdsw.launch_workers(n=1,
                                    cpu=4,
                                    memory=8,
                                    kernel="python3",
                                    script="daskworker.py",
                                    env={"DASKSCHURL": schloc})

# wait for a while until the container is launched successfully
time.sleep(10)
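
daskworker.py isn't shown; a minimal sketch of what it might contain, assuming it only needs to read the DASKSCHURL environment variable passed above and start a Dask worker pointed at that scheduler.

# daskworker.py (hypothetical contents)
import os
import subprocess

scheduler_url = os.environ["DASKSCHURL"]

# Start a Dask worker that registers with the scheduler, and block on it so
# this CDSW worker engine stays alive.
proc = subprocess.Popen(["dask-worker", scheduler_url])
proc.wait()
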