Exemple #1
0
def _restart_debugging(interactive=True):
    """ Restart TensorBoard with a debugger port and return the debugger address.

    Kills the TensorBoard process whose pid is stored in the module-level
    ``tb_pid``, picks a free TCP port for the debugger and launches a new
    TensorBoard using the module-level ``pypath``, ``tb_path`` and ``tb_port``.
    ``tb_pid`` is updated to the pid of the new process.

    Args:
      :interactive: if True the debugger port is passed as ``--debugger_port``,
                    otherwise as ``--debugger_data_server_grpc_port``

    Returns:
      The debugger endpoint as a ``'localhost:<port>'`` string
    """
    global tb_pid
    global pypath
    global tb_path
    global tb_port

    # Kill existing TB. Skip when no pid is recorded: `kill 0` would signal
    # every process in the caller's own process group.
    # (assumes 0 is the "not started" value of tb_pid -- TODO confirm)
    if tb_pid != 0:
        proc = subprocess.Popen(["kill", str(tb_pid)])
        proc.wait()

    # Bind to port 0 so the OS hands out a free port, then release it again
    # so TensorBoard can bind it; close the socket even if bind() fails.
    debugger_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        debugger_socket.bind(('', 0))
        debugger_port = debugger_socket.getsockname()[1]
    finally:
        debugger_socket.close()

    # Hide all GPUs from the TensorBoard process.
    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''

    # The two modes differ only in the name of the debugger flag.
    if interactive:
        debugger_flag = "--debugger_port=%d"
    else:
        debugger_flag = "--debugger_data_server_grpc_port=%d"

    tb_proc = subprocess.Popen(
        [
            pypath, tb_path,
            "--logdir=%s" % logdir(),
            "--port=%d" % tb_port,
            debugger_flag % debugger_port
        ],
        env=tb_env,
        preexec_fn=util.on_executor_exit('SIGTERM'))
    tb_pid = tb_proc.pid

    # presumably gives TensorBoard time to come up before the address is used
    time.sleep(2)

    return 'localhost:' + str(debugger_port)
Exemple #2
0
def visualize(spark_session, hdfs_root_logdir):
    """ Serve all TensorBoard events found under a HopsFS path from one local TensorBoard.

    Intended to be called after TensorFlow jobs have run: tflauncher.launch returns the
    HopsFS path, which is passed here so that all runs can be inspected in the same
    TensorBoard. The TensorBoard URL is written to
    <experiments dir>/<application id>/TensorBoard.driver and the call blocks until the
    TensorBoard process exits.

    Args:
      :spark_session: SparkSession object
      :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """
    app_id = str(spark_session.sparkContext.applicationId)
    python_bin = os.getenv("PYSPARK_PYTHON")

    # always start from an empty local directory for the downloaded events
    events_dir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(events_dir):
        shutil.rmtree(events_dir)
    os.makedirs(events_dir)

    # find free port: bind to port 0, read back the port the OS picked
    port_probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    port_probe.bind(('', 0))
    free_port = port_probe.getsockname()[1]
    tensorboard_bin = util.find_tensorboard()
    port_probe.close()

    # TensorBoard runs with all GPUs hidden
    child_env = os.environ.copy()
    child_env['CUDA_VISIBLE_DEVICES'] = ''

    launch_args = [
        python_bin, tensorboard_bin,
        "--logdir=%s" % events_dir,
        "--port=%d" % free_port,
    ]
    tb_proc = subprocess.Popen(launch_args,
                               env=child_env,
                               preexec_fn=util.on_executor_exit('SIGTERM'))

    # dump tb host:port to hdfs
    tb_url = "http://{0}:{1}".format(socket.gethostname(), free_port)
    tb_endpoint = hopshdfs.get_experiments_dir() + "/" + app_id + "/TensorBoard.driver"
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    # copy every entry of the HDFS logdir except '.log' files next to TensorBoard
    for entry in hopshdfs.get().list_directory(hdfs_root_logdir):
        if splitext(entry['name'])[1] != '.log':
            pydoop.hdfs.get(entry['name'], events_dir)

    # block until TensorBoard terminates
    tb_proc.wait()
    out, err = tb_proc.communicate()
    print(out)
    print(err)
    def _wrapper_fun(iter):
        """ Executor-side body: register with the coordination server and run mpirun on executor 0.

        Every executor registers its host, working directory and GPU ordinals,
        waits for the cluster reservations and localizes the scripts. The
        executor with index 0 then creates the HDFS directories, registers a
        TensorBoard and runs mpirun; all other executors block until executor 0
        reports that mpirun has finished.

        The names server_addr, nb_path, app_id, run_id, localize_scripts,
        get_num_ps, get_hosts_file, print_log and cleanup are resolved outside
        this function.

        Args:
          :iter: iterable whose last element is used as the executor index

        Raises:
          Exception: on executor 0, when mpirun exits with a non-zero return code
        """

        for i in iter:
            executor_num = i

        client = coordination_server.Client(server_addr)

        node_meta = {
            'host': get_ip_address(),
            'executor_cwd': os.getcwd(),
            'cuda_visible_devices_ordinals':
            devices.get_minor_gpu_device_numbers()
        }

        client.register(node_meta)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        # Remember whether the monitor thread was started so that exactly the
        # started thread is stopped again (join() on an unstarted thread raises).
        gpu_monitor_started = devices.get_num_gpus() > 0
        if gpu_monitor_started:
            t_gpus.start()

        # Only spark executor with index 0 should create necessary HDFS directories and start mpirun
        # Other executors simply block until index 0 reports mpirun is finished

        clusterspec = client.await_reservations()

        gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info(
        )
        print(gpu_str)

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        mpi_logfile = open(mpi_logfile_path, 'w')
        try:
            py_runnable = localize_scripts(nb_path, clusterspec)

            # non-chief executor should not do mpirun
            if executor_num != 0:
                client.await_mpirun_finished()
                return

            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
                app_id, run_id, param_string='Horovod')
            tb_hdfs_path, _ = tensorboard.register(
                hdfs_exec_logdir, hdfs_appid_logdir, 0)

            # The leading VAR=value assignments are shell syntax, hence shell=True below.
            mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                      ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                      ' mpirun -np ' + str(get_num_ps(clusterspec)) + ' --hostfile ' + get_hosts_file(clusterspec) + \
                      ' -bind-to none -map-by slot ' + \
                      ' -x LD_LIBRARY_PATH ' + \
                      ' -x HOROVOD_TIMELINE ' + \
                      ' -x TENSORBOARD_LOGDIR ' + \
                      ' -x NCCL_DEBUG=INFO ' + \
                      ' -mca pml ob1 -mca btl ^openib ' + \
                      os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable

            mpi = subprocess.Popen(mpi_cmd,
                                   shell=True,
                                   stdout=mpi_logfile,
                                   stderr=mpi_logfile,
                                   preexec_fn=util.on_executor_exit('SIGTERM'))

            t_log = threading.Thread(target=print_log)
            t_log.start()

            mpi.wait()

            client.register_mpirun_finished()
        finally:
            # Release the log handle and stop the GPU monitor on every path,
            # including non-chief executors and failures.
            mpi_logfile.close()
            if gpu_monitor_started:
                # presumably the monitor loop polls this attribute -- verify
                # against devices.print_periodic_gpu_utilization
                t_gpus.do_run = False
                t_gpus.join()

        # Clean up and stop the log printer before reporting the outcome.
        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()

        if mpi.returncode != 0:
            raise Exception(
                'mpirun FAILED, look in the logs for the error')
Exemple #4
0
    def _wrapper_fun(iter):
        """ Executor-side body: fetch a notebook, convert it to a script and run it under mpirun.

        Steps: create the HDFS log directories and start the HDFS logger,
        register a TensorBoard on executor 0, download the notebook at nb_path,
        convert it with ``jupyter nbconvert --to python``, make the resulting
        script executable and run it with ``mpirun -np $MPI_NP``. When
        local_logdir is set, the local TensorBoard directory is uploaded to the
        HDFS execution directory afterwards.

        The names app_id, run_id, nb_path, local_logdir, print_log and cleanup
        are resolved outside this function.

        Args:
          :iter: iterable whose last element is used as the executor index

        Raises:
          Exception: when mpirun exits with a non-zero return code
        """

        for i in iter:
            executor_num = i

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'horovod')

        tb_hdfs_path = ''

        pydoop.hdfs.dump('',
                         os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())
        hopshdfs.init_logger()
        hopshdfs.log('Starting Spark executor with arguments')
        if executor_num == 0:
            tb_hdfs_path, _ = tensorboard.register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                0,
                local_logdir=local_logdir)

        gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info(
        )
        hopshdfs.log(gpu_str)
        print(gpu_str)

        #1. Download notebook file
        fs_handle = hopshdfs.get_fs()

        # presumably the keyword for the open mode differs between versions of
        # the HDFS client (`flags` vs `mode`) -- verify. Catch Exception rather
        # than everything so KeyboardInterrupt/SystemExit are not swallowed.
        try:
            fd = fs_handle.open_file(nb_path, flags='r')
        except Exception:
            fd = fs_handle.open_file(nb_path, mode='r')

        try:
            notebook = ''.join(fd)
        finally:
            fd.close()

        _, filename = os.path.split(nb_path)
        with open(filename, "w+") as f_nb:
            f_nb.write(notebook)

        # 2. Convert notebook to py file
        jupyter_runnable = os.path.abspath(
            os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
        # Argument list instead of a shell string: file names with spaces or
        # shell metacharacters are passed through unchanged.
        conversion = subprocess.Popen(
            [jupyter_runnable, 'nbconvert', '--to', 'python', filename],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        # communicate() drains both pipes while waiting; calling wait() first
        # can deadlock once the child fills a pipe buffer.
        stdout, stderr = conversion.communicate()
        print(stdout)
        print(stderr)

        # 3. Make py file runnable
        # splitext strips only the last extension, so 'my.model.ipynb' maps to
        # 'my.model.py', the name nbconvert writes.
        py_runnable = os.getcwd() + '/' + os.path.splitext(filename)[0] + '.py'
        st = os.stat(py_runnable)
        os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t_gpus.start()

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        # 4. Run allreduce
        mpi_np = os.environ['MPI_NP']
        # The leading VAR=value assignments are shell syntax, hence shell=True below.
        mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                  ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                  ' mpirun -np ' + str(mpi_np) + \
                  ' -bind-to none -map-by slot ' + \
                  ' -x HOROVOD_TIMELINE ' + \
                  ' -x TENSORBOARD_LOGDIR ' + \
                  ' -x NCCL_DEBUG=INFO ' + \
                  os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable

        # The log handle is closed as soon as mpirun has exited.
        with open(mpi_logfile_path, 'w') as mpi_logfile:
            mpi = subprocess.Popen(mpi_cmd,
                                   shell=True,
                                   stdout=mpi_logfile,
                                   stderr=mpi_logfile,
                                   preexec_fn=util.on_executor_exit('SIGTERM'))

            t_log = threading.Thread(target=print_log)
            t_log.start()

            mpi.wait()

        if devices.get_num_gpus() > 0:
            t_gpus.do_run = False
            t_gpus.join()

        return_code = mpi.returncode

        if local_logdir:
            local_tb = tensorboard.local_logdir_path
            pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

        # Clean up and stop the log printer before reporting the outcome.
        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()

        if return_code != 0:
            # NOTE(review): the logger is not killed on this path (unchanged
            # from the previous behaviour) -- confirm whether that is intended.
            raise Exception('mpirun FAILED, look in the logs for the error')

        hopshdfs.kill_logger()
Exemple #5
0
def register(hdfs_exec_dir,
             endpoint_dir,
             exec_num,
             local_logdir=False,
             tensorboard_driver=False):
    """ Start a TensorBoard process and publish its URL to an endpoint file in HDFS.

    Any TensorBoard recorded in the module-level tb_pid is sent a kill first and
    the module globals are reset through _reset_global(). When tb_pid is 0 after
    that, a new TensorBoard is started on a free port and the module-level
    pypath, tb_port, tb_path, tb_pid, tb_url and endpoint are updated.

    Args:
      :hdfs_exec_dir: stored as the module-level events_logdir; used as the TensorBoard logdir unless local_logdir is set
      :endpoint_dir: directory in which the endpoint file is written
      :exec_num: number appended to the endpoint file name "TensorBoard.task<exec_num>"
      :local_logdir: if True, TensorBoard reads from a freshly created <cwd>/local_logdir instead
      :tensorboard_driver: if True, the endpoint file is named "TensorBoard.driver"

    Returns:
      A tuple (endpoint, tb_pid): the endpoint file path and the TensorBoard pid
    """
    global tb_pid, events_logdir, local_logdir_bool
    global pypath, tb_port, tb_path
    global local_logdir_path, tb_url, endpoint

    if tb_pid != 0:
        subprocess.Popen(["kill", str(tb_pid)])

    _reset_global()

    events_logdir = hdfs_exec_dir
    local_logdir_bool = local_logdir

    if tb_pid == 0:
        pypath = os.getenv("PYSPARK_PYTHON")

        # find free port: bind to port 0, read back the port the OS picked
        port_probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        port_probe.bind(('', 0))
        tb_port = port_probe.getsockname()[1]
        tb_path = util.find_tensorboard()
        port_probe.close()

        # TensorBoard runs with all GPUs hidden
        child_env = os.environ.copy()
        child_env['CUDA_VISIBLE_DEVICES'] = ''

        # pick the directory TensorBoard should read from
        if local_logdir:
            local_logdir_path = os.getcwd() + '/local_logdir'
            if os.path.exists(local_logdir_path):
                shutil.rmtree(local_logdir_path)
            os.makedirs(local_logdir_path)
            served_logdir = local_logdir_path
        else:
            served_logdir = events_logdir

        launch_args = [
            pypath, tb_path,
            "--logdir=%s" % served_logdir,
            "--port=%d" % tb_port,
        ]
        tb_proc = subprocess.Popen(launch_args,
                                   env=child_env,
                                   preexec_fn=util.on_executor_exit('SIGTERM'))
        tb_pid = tb_proc.pid

        tb_url = "http://{0}:{1}".format(socket.gethostname(), tb_port)

        if tensorboard_driver:
            endpoint_name = "/TensorBoard.driver"
        else:
            endpoint_name = "/TensorBoard.task" + str(exec_num)
        endpoint = endpoint_dir + endpoint_name

    # dump tb host:port to hdfs
    pydoop.hdfs.dump(tb_url, endpoint, user=hopshdfs.project_user())

    return endpoint, tb_pid