Esempio n. 1
0
def visualize(hdfs_root_logdir):
    """ Visualize all TensorBoard events for a given path in HopsFS.

    This is intended for use after running TensorFlow jobs, to visualize them
    all in the same TensorBoard. tflauncher.launch returns the path in HopsFS
    which should be handed as argument for this method to visualize all runs.

    The function (re)creates a local ``tensorboard_events`` directory under
    the current working directory, starts a TensorBoard process serving it on
    a free port, writes the TensorBoard URL to
    ``<experiments dir>/<app id>/TensorBoard.visualize`` in HDFS, copies every
    entry of ``hdfs_root_logdir`` whose name does not end in ``.log`` into the
    local directory, and then blocks until the TensorBoard process exits.

    Args:
      :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """

    sc = util._find_spark().sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    # Always start from an empty local events directory.
    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    # Find a free port: bind to port 0 and read back the assigned port. The
    # socket is released even if locating TensorBoard fails.
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        tb_socket.bind(('', 0))
        tb_port = tb_socket.getsockname()[1]

        tb_path = util._find_tensorboard()
    finally:
        tb_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''
    tb_env['LC_ALL'] = 'C'

    tb_proc = subprocess.Popen([
        pypath, tb_path,
        "--logdir=%s" % logdir,
        "--port=%d" % tb_port,
        "--host=%s" % "0.0.0.0"
    ],
                               env=tb_env,
                               preexec_fn=util._on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs._get_experiments_dir(
    ) + "/" + app_id + "/TensorBoard.visualize"
    # Dump the TensorBoard host:port to HDFS
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    # Copy everything except *.log entries into the local events directory
    handle = hopshdfs.get()
    for entry in handle.list_directory(hdfs_root_logdir):
        extension = splitext(entry['name'])[1]
        if extension != '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    # No pipes were requested for the child, so it writes straight to this
    # process's stdout/stderr and communicate() would only return
    # (None, None); just block until TensorBoard exits.
    tb_proc.wait()
Esempio n. 2
0
def get_portable_runner_config(sdk_worker_parallelism=1,
                               worker_threads=100,
                               pre_optimize="all",
                               execution_mode_for_batch="BATCH_FORCED"):
    """
    Instantiate a list of pipeline configuration options for the PortableRunner.

    The list combines fixed options (runner, PROCESS environment type) with
    values read from ``hopsfs`` (WebHDFS host and port), the ``jobserver_host``
    and ``jobserver_port`` names defined outside this function, the command
    returned by ``get_sdk_worker()``, and the arguments below.

    Args:
        sdk_worker_parallelism: value appended to ``--sdk_worker_parallelism=``
        worker_threads: value appended to ``--experiments=worker_threads``
        pre_optimize: value appended to ``--experiments=pre_optimize=``
        execution_mode_for_batch: value appended to ``--execution_mode_for_batch=``

    Returns:
        a list of pipeline configuration options for the PortableRunner.
    """
    return [
        '--runner=PortableRunner',
        # NOTE(review): the '--hdfs_user=' element below is garbled in this
        # copy of the source ('******' is not valid Python). The expression
        # that supplies the user, and the separator before '--job_endpoint=',
        # appear to have been masked and must be restored before this runs.
        '--hdfs_host=' + str(hopsfs.get_webhdfs_host()), '--hdfs_port=' +
        str(hopsfs.get_webhdfs_port()), '--hdfs_user='******'--job_endpoint=' + jobserver_host + ":" + str(jobserver_port),
        '--environment_type=PROCESS',
        '--environment_config=' + '{"command":"' + get_sdk_worker() + '"}',
        '--sdk_worker_parallelism=' + str(sdk_worker_parallelism),
        # NOTE(review): no '=' separates 'worker_threads' from its value here
        # (yields e.g. 'worker_threads100'), unlike the pre_optimize
        # experiment on the next line - confirm whether this is intended.
        '--experiments=worker_threads' + str(worker_threads),
        '--experiments=pre_optimize=' + pre_optimize,
        '--execution_mode_for_batch=' + execution_mode_for_batch
    ]
Esempio n. 3
0
    def _wrapper_fun(iter):
        """
        Execute ``map_fun`` on this executor and record the run in HDFS.

        When ``args_dict`` is non-empty, the positional parameters of
        ``map_fun`` are looked up by name in ``args_dict`` and indexed with the
        executor number. Otherwise ``map_fun`` is called without arguments and
        a truthy return value is handed to ``_handle_return``.

        Args:
            iter: iterable yielding executor numbers; the last value is used

        Returns:
            None
        """

        for item in iter:
            executor_num = item

        tb_pid = 0
        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        gpu_monitor = threading.Thread(
            target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            gpu_monitor.start()

        def _stop_gpu_monitor():
            # Flag the monitor thread to stop and wait for it to finish
            if devices.get_num_gpus() > 0:
                gpu_monitor.do_run = False
                gpu_monitor.join()

        separator = '-------------------------------------------------------'

        try:
            with_args = bool(args_dict)
            if with_args:
                # Collect this executor's value for every positional
                # parameter of map_fun, in declaration order
                fn_code = six.get_function_code(map_fun)
                call_args = []
                rendered = []
                for param_name in fn_code.co_varnames[:fn_code.co_argcount]:
                    param_val = args_dict[param_name][executor_num]
                    rendered.append(str(param_name) + '=' + str(param_val))
                    call_args.append(param_val)
                param_string = '.'.join(rendered)
                start_msg = 'Started running task ' + param_string
                finish_prefix = 'Finished task ' + param_string + ' - took '
            else:
                call_args = []
                param_string = None
                start_msg = 'Started running task'
                finish_prefix = 'Finished task - took '

            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                app_id, run_id, param_string, 'launcher')
            pydoop.hdfs.dump('',
                             os.environ['EXEC_LOGFILE'],
                             user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, tb_pid = tensorboard._register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                executor_num,
                local_logdir=local_logdir)

            gpu_str = ('\nChecking for GPUs in the environment' +
                       devices._get_gpu_info())
            hopshdfs.log(gpu_str)
            print(gpu_str)
            print(separator)
            print(start_msg + '\n')
            hopshdfs.log(start_msg)

            task_start = datetime.datetime.now()
            retval = map_fun(*call_args)
            task_end = datetime.datetime.now()

            # Only the argument-less invocation forwards its return value
            if not with_args and retval:
                _handle_return(retval, hdfs_exec_logdir)

            time_str = finish_prefix + util._time_diff(task_start, task_end)
            print('\n' + time_str)
            print(separator)
            hopshdfs.log(time_str)
        except:
            # Always do cleanup before propagating the failure
            _cleanup(tb_hdfs_path)
            _stop_gpu_monitor()
            raise
        finally:
            # Best effort: copy locally written TensorBoard data to HDFS
            try:
                if local_logdir:
                    local_tb = tensorboard.local_logdir_path
                    util._store_local_tensorboard(local_tb, hdfs_exec_logdir)
            except:
                pass

        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
Esempio n. 4
0
    def _wrapper_fun(iter):
        """
        Run ``map_fun`` on this executor as one worker of a collective
        all-reduce cluster.

        The executor binds a socket to a free port, registers its
        ``host:port`` with the reservation server at ``server_addr``, waits
        for the cluster reservations and exports the result (plus its own
        task entry) as the ``TF_CONFIG`` environment variable. Only the
        worker whose task index is 0 creates the HDFS experiment directories,
        registers TensorBoard, writes to the HDFS log and forwards a truthy
        return value of ``map_fun`` to ``_handle_return``.

        Args:
            iter: iterable yielding executor numbers; the last value is used

        Returns:
            None
        """

        for i in iter:
            executor_num = i

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        task_index = None

        try:
            host = util._get_ip_address()

            # The bound socket is kept open until the reservations have been
            # collected (presumably so the advertised port is not handed out
            # again meanwhile). Close it even when registration fails, so a
            # failed task does not leak the descriptor.
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                tmp_socket.bind(('', 0))
                port = tmp_socket.getsockname()[1]

                client = allreduce_reservation.Client(server_addr)
                host_port = host + ":" + str(port)

                client.register({"worker": host_port, "index": executor_num})
                cluster = client.await_reservations()
            finally:
                tmp_socket.close()
            client.close()

            task_index = _find_index(host_port, cluster)
            is_chief = task_index == 0

            cluster["task"] = {"type": "worker", "index": task_index}

            os.environ["TF_CONFIG"] = json.dumps(cluster)

            if is_chief:
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id, run_id, None, 'collective_all_reduce')
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)
            gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info(
            )
            if is_chief:
                hopshdfs.log(gpu_str)
            print(gpu_str)
            print('-------------------------------------------------------')
            print('Started running task \n')
            if is_chief:
                hopshdfs.log('Started running task')
            task_start = datetime.datetime.now()

            retval = map_fun()
            if is_chief and retval:
                _handle_return(retval, hdfs_exec_logdir)
            task_end = datetime.datetime.now()
            time_str = 'Finished task - took ' + util._time_diff(
                task_start, task_end)
            print('\n' + time_str)
            print('-------------------------------------------------------')
            if is_chief:
                hopshdfs.log(time_str)
        except:
            #Always do cleanup
            _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                t.do_run = False
                t.join()
            raise
        finally:
            # task_index stays None if the failure happened before the
            # cluster was resolved, in which case nothing is stored.
            if task_index == 0:
                if local_logdir:
                    local_tb = tensorboard.local_logdir_path
                    util._store_local_tensorboard(local_tb, hdfs_exec_logdir)

        _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
    def _wrapper_fun(iter):
        """
        Evaluate ``map_fun`` for one parameter combination of a differential
        evolution generation and write the resulting metric to HDFS.

        The positional parameters of ``map_fun`` are looked up by name in
        ``args_dict`` and indexed with the executor number. If ``_get_metric``
        yields a truthy value for the same parameter string, that value is
        reused instead of calling ``map_fun``. The metric is written as a
        float string to ``<hdfs_exec_logdir>/metric``. Nothing is evaluated
        when ``args_dict`` is empty.

        Args:
            iter: iterable yielding executor numbers; the last value is used

        Returns:
            None

        Raises:
            ValueError: if the metric cannot be converted to a number
        """

        for i in iter:
            executor_num = i

        tb_pid = 0
        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        global local_logdir_bool

        try:
            #Arguments
            if args_dict:
                # Get args for executor and run function: one value per
                # positional parameter of map_fun, in declaration order
                fn_code = six.get_function_code(map_fun)
                args = []
                param_parts = []
                for param_name in fn_code.co_varnames[:fn_code.co_argcount]:
                    param_val = args_dict[param_name][executor_num]
                    param_parts.append(str(param_name) + '=' + str(param_val))
                    args.append(param_val)
                param_string = '.'.join(param_parts)

                val = _get_metric(param_string, app_id, generation_id, run_id)
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
                    app_id,
                    run_id,
                    param_string,
                    'differential_evolution',
                    sub_type='generation.' + str(generation_id))
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs.init_logger()
                tb_hdfs_path, tb_pid = tensorboard.register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir_bool)
                gpu_str = '\nChecking for GPUs in the environment' + devices.get_gpu_info(
                )
                hopshdfs.log(gpu_str)
                print(gpu_str)
                print(
                    '-------------------------------------------------------')
                print('Started running task ' + param_string + '\n')
                # NOTE(review): these are truthiness checks, so a stored
                # metric of 0 is treated as missing and map_fun is re-run -
                # confirm whether an explicit `is None` test is intended.
                if val:
                    print('Reading returned metric from previous run: ' +
                          str(val))
                hopshdfs.log('Started running task ' + param_string)
                task_start = datetime.datetime.now()
                if not val:
                    val = map_fun(*args)
                task_end = datetime.datetime.now()
                time_str = 'Finished task ' + param_string + ' - took ' + util.time_diff(
                    task_start, task_end)
                print('\n' + time_str)
                hopshdfs.log(time_str)

                # Validate that the metric is numeric. Only conversion
                # failures are translated; interrupts and exits propagate.
                try:
                    int(val)
                except (TypeError, ValueError, OverflowError):
                    raise ValueError(
                        'Your function needs to return a metric (number) which should be maximized or minimized'
                    )

                metric_file = hdfs_exec_logdir + '/metric'
                fs_handle = hopshdfs.get_fs()
                # The keyword naming the open mode is tried both ways
                # (`mode` first, then `flags`).
                try:
                    fd = fs_handle.open_file(metric_file, mode='w')
                except Exception:
                    fd = fs_handle.open_file(metric_file, flags='w')

                # Close the handle even if writing the metric fails
                try:
                    fd.write(str(float(val)).encode())
                    fd.flush()
                finally:
                    fd.close()
                print('Returning metric ' + str(val))
                print(
                    '-------------------------------------------------------')
        except:
            #Always do cleanup
            if tb_hdfs_path:
                _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                t.do_run = False
                t.join()
            raise
        finally:
            if local_logdir_bool:
                local_tb = tensorboard.local_logdir_path
                util.store_local_tensorboard(local_tb, hdfs_exec_logdir)

        hopshdfs.log('Finished running')
        if tb_hdfs_path:
            _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
Esempio n. 6
0
def _register(hdfs_exec_dir, endpoint_dir, exec_num, local_logdir=False):
    """
    Start a TensorBoard process for this executor and publish its URL in HDFS.

    A TensorBoard whose pid is still recorded in ``tb_pid`` is sent a ``kill``
    first, then ``_reset_global()`` is called. If ``tb_pid`` is 0 afterwards
    (presumably always, since ``_reset_global`` is expected to clear it -
    TODO confirm), a new TensorBoard is launched on a free port. It serves
    either a freshly created ``local_logdir`` directory under the current
    working directory or ``hdfs_exec_dir``.

    Args:
        hdfs_exec_dir: HDFS directory used as TensorBoard logdir when
            ``local_logdir`` is falsy
        endpoint_dir: HDFS directory in which the endpoint file is written
        exec_num: executor number, appended to the endpoint file name
        local_logdir: if truthy, TensorBoard reads from a local directory

    Returns:
        tuple ``(endpoint, tb_pid)``: HDFS path of the endpoint file and the
        pid of the TensorBoard process
    """
    global tb_pid, events_logdir, local_logdir_bool
    global pypath, tb_port, tb_path, local_logdir_path, tb_url, endpoint

    # Ask a TensorBoard left over from an earlier registration to terminate
    if tb_pid != 0:
        subprocess.Popen(["kill", str(tb_pid)])

    _reset_global()

    events_logdir = hdfs_exec_dir
    local_logdir_bool = local_logdir

    if tb_pid == 0:
        pypath = os.getenv("PYSPARK_PYTHON")

        # Find a free port: bind to port 0 and read back what was assigned
        port_probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        port_probe.bind(('', 0))
        tb_port = port_probe.getsockname()[1]

        tb_path = experiment_utils._find_tensorboard()

        port_probe.close()

        tb_env = _init_tb_env()

        if local_logdir:
            # Serve from an empty directory below the working directory
            local_logdir_path = os.getcwd() + '/local_logdir'
            if os.path.exists(local_logdir_path):
                shutil.rmtree(local_logdir_path)
            os.makedirs(local_logdir_path)

            local_logdir_path = local_logdir_path + '/'
            served_logdir = local_logdir_path
        else:
            served_logdir = events_logdir

        tb_command = [
            pypath, tb_path,
            "--logdir=%s" % served_logdir,
            "--port=%d" % tb_port,
            "--host=%s" % "0.0.0.0"
        ]
        tb_proc = subprocess.Popen(
            tb_command,
            env=tb_env,
            preexec_fn=util._on_executor_exit('SIGTERM'))

        tb_pid = tb_proc.pid

        tb_url = "http://{0}:{1}".format(socket.gethostname(), tb_port)
        endpoint = endpoint_dir + "/TensorBoard.task" + str(exec_num)

    # Dump the TensorBoard host:port to HDFS
    pydoop.hdfs.dump(tb_url, endpoint, user=hopshdfs.project_user())

    return endpoint, tb_pid
Esempio n. 7
0
def begin(name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    Start a custom Experiment; call *end(metric)* once the experiment is done.

    *IMPORTANT* - Do not combine this call with other functions of the
    experiment module, other than *end*. Other experiment functions such as
    *grid_search* manage *begin* and *end* internally.

    Example usage:

    >>> from hops import experiment
    >>> experiment.begin(name='calculate pi')
    >>> # Code to calculate pi
    >>> pi = calc_pi()
    >>> experiment.end(pi)

    Args:
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    Raises:
        RuntimeError: if an experiment is already marked as running

    """
    global running, app_id, experiment_json, elastic_id, run_id
    global driver_tensorboard_hdfs_path

    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.stop() to stop it.")

    try:
        running = True

        spark_context = util._find_spark().sparkContext
        app_id = str(spark_context.applicationId)

        run_id += 1

        versioned_path = util._version_resources(
            versioned_resources, _get_logdir(app_id))

        # Drop the previous description before the new one is built, so the
        # global is None if building it fails
        experiment_json = None
        experiment_json = util._populate_experiment(
            spark_context, name, 'experiment', 'begin',
            _get_logdir(app_id), None, versioned_path, description)

        util._put_elastic(
            hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
            app_id, run_id, None, 'begin')

        # Create an empty executor log file in HDFS
        pydoop.hdfs.dump(
            '', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())

        hopshdfs._init_logger()

        driver_tensorboard_hdfs_path, _unused_pid = tensorboard._register(
            hdfs_exec_logdir, hdfs_appid_logdir, 0, local_logdir=local_logdir)
    except:
        _exception_handler()
        raise

    return driver_tensorboard_hdfs_path
Esempio n. 8
0
    def _wrapper_fun(iter):
        """
        Run ``map_fun`` on this executor as a member of a parameter-server
        cluster.

        Executors numbered below ``num_ps`` register as ``ps``, all others as
        ``worker``. After the reservations are collected, the role and index
        resolved by ``_find_task_and_index`` are exported through
        ``TF_CONFIG``. A ``ps`` runs ``map_fun`` in a background thread and
        waits until all workers have reported completion; other roles run
        ``map_fun`` directly. Only the ``chief`` creates the HDFS experiment
        directories, registers TensorBoard, writes to the HDFS log and
        forwards a truthy return value to ``_handle_return``.

        Args:
            iter: iterable yielding executor numbers; the last value is used

        Returns:
            None
        """

        for i in iter:
            executor_num = i

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        role = None

        client = parameter_server_reservation.Client(server_addr)

        try:
            host = util._get_ip_address()

            # The bound socket is kept open until the reservations have been
            # collected; close it even if registration fails so the
            # descriptor is not leaked.
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                tmp_socket.bind(('', 0))
                port = tmp_socket.getsockname()[1]
                host_port = host + ":" + str(port)

                exec_spec = {}
                if executor_num < num_ps:
                    exec_spec["task_type"] = "ps"
                else:
                    exec_spec["task_type"] = "worker"
                exec_spec["host_port"] = host_port
                exec_spec["gpus_present"] = devices.get_num_gpus() > 0

                client.register(exec_spec)

                cluster = client.await_reservations()
            finally:
                tmp_socket.close()

            role, index = _find_task_and_index(host_port, cluster)

            cluster_spec = {}
            cluster_spec["cluster"] = cluster
            cluster_spec["task"] = {"type": role, "index": index}

            print(cluster_spec)

            os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

            if role == "chief":
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id, run_id, None, 'parameter_server')
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)
            gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info(
            )
            if role == "chief":
                hopshdfs.log(gpu_str)
            print(gpu_str)
            print('-------------------------------------------------------')
            print('Started running task \n')
            if role == "chief":
                hopshdfs.log('Started running task')
            task_start = datetime.datetime.now()

            retval = None
            if role == "ps":
                # The parameter server runs in the background; this thread is
                # deliberately not joined, the executor only waits for the
                # workers to report that they are done.
                ps_thread = threading.Thread(target=map_fun)
                ps_thread.start()
                print("waiting for workers")
                client.await_all_workers_finished()
                print("waiting finished")
            else:
                retval = map_fun()

            if role == "chief":
                if retval:
                    _handle_return(retval, hdfs_exec_logdir)

            task_end = datetime.datetime.now()
            time_str = 'Finished task - took ' + util._time_diff(
                task_start, task_end)
            print('\n' + time_str)
            print('-------------------------------------------------------')
            if role == "chief":
                hopshdfs.log(time_str)
        except:
            _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                t.do_run = False
                t.join()
            raise
        finally:
            try:
                if role == "chief":
                    if local_logdir:
                        local_tb = tensorboard.local_logdir_path
                        util._store_local_tensorboard(local_tb,
                                                      hdfs_exec_logdir)
            finally:
                # Must run even if storing TensorBoard data failed: otherwise
                # this worker never reports completion and the parameter
                # servers keep waiting in await_all_workers_finished().
                # Notification itself stays best effort.
                try:
                    if role == "worker" or role == "chief":
                        client.register_worker_finished()
                    client.close()
                except Exception:
                    pass

        _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
Esempio n. 9
0
    def _wrapper_fun(iter):
        """
        Run the notebook at ``nb_path`` under ``mpirun`` on this executor.

        Steps: create the HDFS experiment directories and logger, register
        TensorBoard on executor 0, download the notebook from HDFS, convert
        it to a Python file with ``jupyter nbconvert``, make that file
        executable, and launch it through ``mpirun`` with ``MPI_NP``
        processes while a background thread runs ``print_log``. Local
        TensorBoard data is copied to HDFS when ``local_logdir`` is set.

        Args:
            iter: iterable yielding executor numbers; the last value is used

        Returns:
            None

        Raises:
            Exception: if ``mpirun`` exits with a non-zero return code
        """

        for i in iter:
            executor_num = i

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'horovod')

        tb_pid = 0
        tb_hdfs_path = ''

        pydoop.hdfs.dump('',
                         os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())
        hopshdfs.init_logger()
        hopshdfs.log('Starting Spark executor with arguments')
        if executor_num == 0:
            tb_hdfs_path, tb_pid = tensorboard.register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                0,
                local_logdir=local_logdir)

        gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info(
        )
        hopshdfs.log(gpu_str)
        print(gpu_str)

        #1. Download notebook file
        fs_handle = hopshdfs.get_fs()

        # The keyword naming the open mode is tried both ways
        # (`flags` first, then `mode`).
        try:
            fd = fs_handle.open_file(nb_path, flags='r')
        except Exception:
            fd = fs_handle.open_file(nb_path, mode='r')

        try:
            notebook = ''.join(fd)
        finally:
            fd.close()

        filename = os.path.basename(nb_path)
        with open(filename, "w+") as f_nb:
            f_nb.write(notebook)

        # 2. Convert notebook to py file
        jupyter_runnable = os.path.abspath(
            os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
        # An argument list (no shell) keeps notebook names containing spaces
        # or shell metacharacters intact.
        conversion = subprocess.Popen(
            [jupyter_runnable, 'nbconvert', '--to', 'python', filename],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        # communicate() drains both pipes while waiting for exit; calling
        # wait() first can deadlock once a pipe buffer fills up.
        stdout, stderr = conversion.communicate()
        print(stdout)
        print(stderr)

        # 3. Make py file runnable
        # Strip only the final extension so dotted notebook names
        # (e.g. "my.model.ipynb" -> "my.model.py") resolve correctly.
        py_runnable = os.getcwd() + '/' + os.path.splitext(filename)[0] + '.py'
        st = os.stat(py_runnable)
        os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t_gpus.start()

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        # 4. Run allreduce
        # NOTE(review): this command is still run through the shell because
        # it relies on VAR=value prefixes; paths with spaces would break it.
        mpi_np = os.environ['MPI_NP']
        mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                  ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                  ' mpirun -np ' + str(mpi_np) + \
                  ' -bind-to none -map-by slot ' + \
                  ' -x HOROVOD_TIMELINE ' + \
                  ' -x TENSORBOARD_LOGDIR ' + \
                  ' -x NCCL_DEBUG=INFO ' + \
                  os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable

        # The log file handle is only needed until mpirun has exited
        with open(mpi_logfile_path, 'w') as mpi_logfile:
            mpi = subprocess.Popen(mpi_cmd,
                                   shell=True,
                                   stdout=mpi_logfile,
                                   stderr=mpi_logfile,
                                   preexec_fn=util.on_executor_exit('SIGTERM'))

            t_log = threading.Thread(target=print_log)
            t_log.start()

            mpi.wait()

        if devices.get_num_gpus() > 0:
            t_gpus.do_run = False
            t_gpus.join()

        return_code = mpi.returncode

        if local_logdir:
            local_tb = tensorboard.local_logdir_path
            pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

        # Cleanup is identical for success and failure
        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()

        if return_code != 0:
            raise Exception('mpirun FAILED, look in the logs for the error')

        hopshdfs.kill_logger()
Esempio n. 10
0
def begin(spark,
          name='no-name',
          local_logdir=False,
          versioned_resources=None,
          description=None):
    """ Start an experiment

    Marks an experiment as running, increments the run id, versions the given
    resources, records the experiment description, creates the HDFS
    directories and executor log file for the run, and registers a driver
    TensorBoard.

    Args:
      :spark: SparkSession object
      :name: (optional) name of the job
      :local_logdir: (optional) passed on to tensorboard.register as *local_logdir*
      :versioned_resources: (optional) resources handed to util.version_resources
      :description: (optional) description handed to util.populate_experiment

    Returns:
      None

    Raises:
      RuntimeError: if an experiment is already marked as running
    """
    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.stop() to stop it."
        )

    try:
        global app_id
        global experiment_json
        global elastic_id
        global run_id
        global driver_tensorboard_hdfs_path

        running = True

        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        run_id = run_id + 1

        # Version the resources exactly once; the result is recorded in the
        # experiment description below.
        versioned_path = util.version_resources(versioned_resources,
                                                get_logdir(app_id))

        # Reset first so the global is None if building the description fails
        experiment_json = None

        experiment_json = util.populate_experiment(sc, name,
                                                   'experiment', 'begin',
                                                   get_logdir(app_id), None,
                                                   versioned_path, description)

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'begin')

        # Create an empty executor log file in HDFS
        pydoop.hdfs.dump('',
                         os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())

        hopshdfs.init_logger()

        driver_tensorboard_hdfs_path, _ = tensorboard.register(
            hdfs_exec_logdir,
            hdfs_appid_logdir,
            0,
            local_logdir=local_logdir,
            tensorboard_driver=True)
    except:
        exception_handler()
        raise

    return
Esempio n. 11
0
    def _wrapper_fun(iter):
        """
        Execute ``map_fun`` without arguments on this executor for the
        'mirrored' experiment type and record the run in HDFS.

        A truthy return value of ``map_fun`` is handed to ``_handle_return``.

        Args:
            iter: iterable yielding executor numbers; the last value is used

        Returns:
            None
        """

        for item in iter:
            executor_num = item

        tb_pid = 0
        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        gpu_monitor = threading.Thread(
            target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            gpu_monitor.start()

        def _stop_gpu_monitor():
            # Flag the monitor thread to stop and wait for it to finish
            if devices.get_num_gpus() > 0:
                gpu_monitor.do_run = False
                gpu_monitor.join()

        separator = '-------------------------------------------------------'

        try:
            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                app_id, run_id, None, 'mirrored')
            # Create an empty executor log file in HDFS
            pydoop.hdfs.dump('',
                             os.environ['EXEC_LOGFILE'],
                             user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, tb_pid = tensorboard._register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                executor_num,
                local_logdir=local_logdir)

            gpu_str = ('\nChecking for GPUs in the environment' +
                       devices._get_gpu_info())
            hopshdfs.log(gpu_str)
            print(gpu_str)
            print(separator)
            print('Started running task\n')
            hopshdfs.log('Started running task')

            task_start = datetime.datetime.now()
            retval = map_fun()
            task_end = datetime.datetime.now()
            if retval:
                _handle_return(retval, hdfs_exec_logdir)

            time_str = ('Finished task - took ' +
                        util._time_diff(task_start, task_end))
            print('\n' + time_str)
            print(separator)
            hopshdfs.log(time_str)
        except:
            # Always do cleanup before propagating the failure
            _cleanup(tb_hdfs_path)
            _stop_gpu_monitor()
            raise
        finally:
            # Best effort: copy locally written TensorBoard data to HDFS
            try:
                if local_logdir:
                    local_tb = tensorboard.local_logdir_path
                    util._store_local_tensorboard(local_tb, hdfs_exec_logdir)
            except:
                pass

        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
Esempio n. 12
0
def register(hdfs_exec_dir,
             endpoint_dir,
             exec_num,
             local_logdir=False,
             tensorboard_driver=False):
    """
    Start a TensorBoard process and publish its URL in HDFS.

    A TensorBoard whose pid is still recorded in ``tb_pid`` is sent a ``kill``
    first, then ``_reset_global()`` is called. If ``tb_pid`` is 0 afterwards
    (presumably always, since ``_reset_global`` is expected to clear it -
    TODO confirm), a new TensorBoard is launched on a free port. It serves
    either a freshly created ``local_logdir`` directory under the current
    working directory or ``hdfs_exec_dir``.

    Args:
        hdfs_exec_dir: HDFS directory used as TensorBoard logdir when
            ``local_logdir`` is falsy
        endpoint_dir: HDFS directory in which the endpoint file is written
        exec_num: executor number, appended to the task endpoint file name
        local_logdir: if truthy, TensorBoard reads from a local directory
        tensorboard_driver: if truthy, the endpoint file is named
            ``TensorBoard.driver`` instead of ``TensorBoard.task<exec_num>``

    Returns:
        tuple ``(endpoint, tb_pid)``: HDFS path of the endpoint file and the
        pid of the TensorBoard process
    """
    global tb_pid, events_logdir, local_logdir_bool
    global pypath, tb_port, tb_path, local_logdir_path, tb_url, endpoint

    # Ask a TensorBoard left over from an earlier registration to terminate
    if tb_pid != 0:
        subprocess.Popen(["kill", str(tb_pid)])

    _reset_global()

    events_logdir = hdfs_exec_dir
    local_logdir_bool = local_logdir

    if tb_pid == 0:
        pypath = os.getenv("PYSPARK_PYTHON")

        # Find a free port: bind to port 0 and read back what was assigned
        port_probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        port_probe.bind(('', 0))
        tb_port = port_probe.getsockname()[1]

        tb_path = util.find_tensorboard()

        port_probe.close()

        tb_env = os.environ.copy()
        tb_env['CUDA_VISIBLE_DEVICES'] = ''

        if local_logdir:
            # Serve from an empty directory below the working directory
            local_logdir_path = os.getcwd() + '/local_logdir'
            if os.path.exists(local_logdir_path):
                shutil.rmtree(local_logdir_path)
            os.makedirs(local_logdir_path)
            served_logdir = local_logdir_path
        else:
            served_logdir = events_logdir

        tb_command = [
            pypath, tb_path,
            "--logdir=%s" % served_logdir,
            "--port=%d" % tb_port
        ]
        tb_proc = subprocess.Popen(
            tb_command,
            env=tb_env,
            preexec_fn=util.on_executor_exit('SIGTERM'))

        tb_pid = tb_proc.pid

        tb_url = "http://{0}:{1}".format(socket.gethostname(), tb_port)
        if tensorboard_driver:
            endpoint = endpoint_dir + "/TensorBoard.driver"
        else:
            endpoint = endpoint_dir + "/TensorBoard.task" + str(exec_num)

    # Dump the TensorBoard host:port to HDFS
    pydoop.hdfs.dump(tb_url, endpoint, user=hopshdfs.project_user())

    return endpoint, tb_pid