Example #1
def exit_handler():
    global experiment_json
    global elastic_id
    if running and experiment_json is not None:
        experiment_json = json.loads(experiment_json)
        experiment_json['status'] = "KILLED"
        experiment_json['finished'] = datetime.now().isoformat()
        experiment_json = json.dumps(experiment_json)
        util.put_elastic(hopshdfs.project_name(), app_id,
                         str('dist' + str(elastic_id)), experiment_json)
Example #2
def exception_handler():
    global running
    global experiment_json
    if running and experiment_json is not None:
        experiment_json = json.loads(experiment_json)
        experiment_json['status'] = "FAILED"
        experiment_json['finished'] = datetime.now().isoformat()
        experiment_json = json.dumps(experiment_json)
        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)
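Examples #1 and #2 are module-level cleanup hooks: if the process exits or an exception escapes while an experiment is still marked as running, they rewrite the experiment document with a KILLED or FAILED status and push it back to Elasticsearch. A minimal sketch of how such a hook could be wired up is shown below; the atexit registration is an assumption for illustration, not taken from the source.

import atexit

# Hypothetical wiring (not shown in the source): if the interpreter shuts down
# while an experiment is still running, exit_handler() marks it as KILLED.
atexit.register(exit_handler)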
Example #3
def end(metric=None):
    global running
    global experiment_json
    global elastic_id
    global driver_tensorboard_hdfs_path
    global app_id
    if not running:
        raise RuntimeError(
            "An experiment is not running. Did you forget to call experiment.end()?"
        )
    try:
        if metric:
            experiment_json = util.finalize_experiment(experiment_json, None,
                                                       str(metric))
            util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                             experiment_json)
        else:
            experiment_json = util.finalize_experiment(experiment_json, None,
                                                       None)
            util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                             experiment_json)
    except:
        exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False
        handle = hopshdfs.get()

        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])

        if tensorboard.local_logdir_bool:
            local_tb = tensorboard.local_logdir_path
            util.store_local_tensorboard(local_tb, tensorboard.events_logdir)

        if tensorboard.endpoint is not None and tensorboard.endpoint != '' \
                and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)
        hopshdfs.kill_logger()
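A minimal usage sketch of the begin()/end() pair defined in Examples #3 and #6, assuming the functions are exposed through a module named experiment (the import path is an assumption):

from hops import experiment   # assumed import path

experiment.begin(spark, name='mnist_baseline')
# ... training code that writes TensorBoard summaries ...
experiment.end(metric=0.97)   # the optional metric is stored with the experiment entry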
Example #4
def run(sc,
        map_fun,
        tf_args,
        num_executors,
        num_ps,
        tensorboard=False,
        input_mode=InputMode.TENSORFLOW,
        log_dir=None,
        driver_ps_nodes=False,
        master_node=None,
        reservation_timeout=600,
        name='no-name',
        local_logdir=False,
        versioned_resources=None,
        description=None,
        queues=['input', 'output', 'error']):
    """Starts the TensorFlowOnSpark cluster and Runs the TensorFlow "main" function on the Spark executors

  Args:
    :sc: SparkContext
    :map_fun: user-supplied TensorFlow "main" function
    :tf_args: ``argparse`` args, or command-line ``ARGV``.  These will be passed to the ``map_fun``.
    :num_executors: number of Spark executors.  This should match your Spark job's ``--num_executors``.
    :num_ps: number of Spark executors which are reserved for TensorFlow PS nodes.  All other executors will be used as TensorFlow worker nodes.
    :tensorboard: boolean indicating if the chief worker should spawn a TensorBoard server.
    :input_mode: TFCluster.InputMode
    :log_dir: directory to save TensorBoard event logs.  If None, defaults to a fixed path on the local filesystem.
    :driver_ps_nodes: run the PS nodes locally on the driver instead of on the Spark executors; this helps maximize computing resources (esp. GPU). You will need to set cluster_size = num_executors + num_ps
    :master_node: name of the "master" or "chief" node in the cluster_template, used for `tf.estimator` applications.
    :reservation_timeout: number of seconds after which cluster reservation times out (600 sec default)
    :queues: *INTERNAL_USE*

  Returns:
    A TFCluster object representing the started cluster.
  """

    # in Hopsworks we always want TensorBoard to be enabled:
    global elastic_id
    global running
    global run_id
    tb = True
    elastic_id = elastic_id + 1
    run_id = run_id + 1
    running = True

    logging.info(
        "Reserving TFSparkNodes {0}".format("w/ TensorBoard" if tb else ""))
    assert num_ps < num_executors

    if driver_ps_nodes:
        raise Exception(
            'running PS nodes on driver is not supported and not needed on Hops Hadoop, since we have GPU scheduling.'
        )

    if log_dir:
        raise Exception(
            'No need to specify log_dir directory, we save TensorBoard events in the directory returned by tensorboard.logdir for you'
        )

    # build a cluster_spec template using worker_nums
    cluster_template = {}
    cluster_template['ps'] = range(num_ps)
    if master_node is None:
        cluster_template['worker'] = range(num_ps, num_executors)
    else:
        cluster_template[master_node] = range(num_ps, num_ps + 1)
        if num_executors > num_ps + 1:
            cluster_template['worker'] = range(num_ps + 1, num_executors)
    logging.info("cluster_template: {}".format(cluster_template))

    # get default filesystem from spark
    defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
    # strip trailing "root" slash from "file:///" to be consistent w/ "hdfs://..."
    if defaultFS.startswith(
            "file://") and len(defaultFS) > 7 and defaultFS.endswith("/"):
        defaultFS = defaultFS[:-1]

    # get current working dir of spark launch
    working_dir = os.getcwd()

    # start a server to listen for reservations and broadcast cluster_spec
    server = reservation.Server(num_executors)
    server_addr = server.start()

    # start TF nodes on all executors
    logging.info("Starting TensorFlow on executors")
    cluster_meta = {
        'id': random.getrandbits(64),
        'cluster_template': cluster_template,
        'num_executors': num_executors,
        'default_fs': defaultFS,
        'working_dir': working_dir,
        'server_addr': server_addr
    }

    nodeRDD = sc.parallelize(range(num_executors), num_executors)
    global app_id
    app_id = sc.applicationId
    global experiment_json

    versioned_path = util.version_resources(versioned_resources,
                                            get_logdir(app_id))

    experiment_json = None
    experiment_json = util.populate_experiment(sc, name, 'TFCluster', 'run',
                                               get_logdir(app_id), None,
                                               versioned_path, description)

    util.put_elastic(hopshdfs.project_name(), app_id,
                     str('dist' + str(elastic_id)), experiment_json)

    # start TF on a background thread (on Spark driver) to allow for feeding job

    def _start(status):
        try:
            nodeRDD.foreachPartition(
                TFSparkNode.run(map_fun,
                                tf_args,
                                cluster_meta,
                                tb,
                                None,
                                app_id,
                                run_id,
                                queues,
                                local_logdir=local_logdir,
                                background=(input_mode == InputMode.SPARK)))
        except Exception as e:
            logging.error("Exception in TF background thread")
            status['error'] = str(e)
            exception_handler()

    t = threading.Thread(target=_start, args=(tf_status, ))
    # run as daemon thread so that in spark mode main thread can exit
    # if feeder spark stage fails and main thread can't do explicit shutdown
    t.daemon = True

    t.start()

    # wait for executors to check GPU presence
    logging.info("Waiting for GPU presence check to start")
    gpus_present = server.await_gpu_check()
    logging.info("All GPU checks completed")

    # wait for executors to register and start TFNodes before continuing
    logging.info("Waiting for TFSparkNodes to start")
    cluster_info = server.await_reservations(sc, tf_status,
                                             reservation_timeout)
    logging.info("All TFSparkNodes started")

    # print cluster_info and extract TensorBoard URL
    tb_url = None
    for node in cluster_info:
        logging.info(node)
        if node['tb_port'] != 0:
            tb_url = "http://{0}:{1}".format(node['host'], node['tb_port'])

    if tb_url is not None:
        logging.info(
            "========================================================================================"
        )
        logging.info("")
        logging.info("TensorBoard running at:       {0}".format(tb_url))
        logging.info("")
        logging.info(
            "========================================================================================"
        )

    # since our "primary key" for each executor's TFManager is (host, executor_id), sanity check for duplicates

    # Note: this may occur if Spark retries failed Python tasks on the same executor.
    tb_nodes = set()
    for node in cluster_info:
        node_id = (node['host'], node['executor_id'])
        if node_id in tb_nodes:
            raise Exception(
                "Duplicate cluster node id detected (host={0}, executor_id={1})"
                .format(node_id[0], node_id[1]) + "Please ensure that:\n" +
                "1. Number of executors >= number of TensorFlow nodes\n" +
                "2. Number of tasks per executors is 1\n" +
                "3, TFCluster.shutdown() is successfully invoked when done.")
        else:
            tb_nodes.add(node_id)

    # create TFCluster object
    cluster = TFCluster()
    cluster.sc = sc
    cluster.meta = cluster_meta
    cluster.nodeRDD = nodeRDD
    cluster.cluster_info = cluster_info
    cluster.cluster_meta = cluster_meta
    cluster.input_mode = input_mode
    cluster.queues = queues
    cluster.server = server

    return cluster
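A minimal call sketch for run(), based only on the signature above. The two-argument map_fun signature follows the usual TensorFlowOnSpark convention, and sc is assumed to be the active SparkContext; InputMode comes from the same module as run().

def main_fun(argv, ctx):
    # TensorFlow "main": ctx provides job_name, task_index, cluster spec, etc.
    pass

cluster = run(sc, main_fun, tf_args=None,
              num_executors=4, num_ps=1,
              input_mode=InputMode.TENSORFLOW,
              name='distributed_training')
cluster.shutdown()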
Example #5
    def shutdown(self, ssc=None):
        """Stops the distributed TensorFlow cluster.

    Args:
      :ssc: *For Streaming applications only*. Spark StreamingContext
    """
        logging.info("Stopping TensorFlow nodes")

        # identify ps/workers
        ps_list, worker_list = [], []
        for node in self.cluster_info:
            if node['job_name'] == 'ps':
                ps_list.append(node)
            else:
                worker_list.append(node)

        if ssc is not None:
            # Spark Streaming
            done = False
            while not done:
                done = ssc.awaitTerminationOrTimeout(1)
                if not done and self.server.done:
                    logging.info("Server done, stopping StreamingContext")
                    ssc.stop(stopSparkContext=False, stopGraceFully=True)
                done = done or self.server.done
        else:
            # in TENSORFLOW mode, there is no "data feeding" job, only a "start" job, so we must wait for the TensorFlow workers
            # to complete all tasks, while accounting for any PS tasks which run indefinitely.
            if self.input_mode == InputMode.TENSORFLOW:
                count = 0
                done = False
                while not done:
                    st = self.sc.statusTracker()
                    jobs = st.getActiveJobsIds()
                    if len(jobs) > 0:
                        stages = st.getActiveStageIds()
                        for i in stages:
                            si = st.getStageInfo(i)
                            if si.numActiveTasks == len(ps_list):
                                # if we only have PS tasks left, check that we see this condition a couple times
                                count += 1
                                done = (count >= 3)
                                time.sleep(5)
                    else:
                        done = True
                        global running
                        running = False

            # shutdown queues and managers for "worker" executors.
            # note: in SPARK mode, this job will immediately queue up behind the "data feeding" job.
            # in TENSORFLOW mode, this will only run after all workers have finished.
            workers = len(worker_list)
            workerRDD = self.sc.parallelize(range(workers), workers)
            workerRDD.foreachPartition(
                TFSparkNode.shutdown(self.cluster_info, self.queues))

        # exit Spark application w/ err status if TF job had any errors
        if 'error' in tf_status:
            logging.error("Exiting Spark application with error status.")
            exception_handler()
            self.sc.cancelAllJobs()
            #self.sc.stop()
            #sys.exit(1)
        global experiment_json
        global app_id
        experiment_json = util.finalize_experiment(experiment_json, None, None)

        util.put_elastic(hopshdfs.project_name(), app_id,
                         str('dist' + str(elastic_id)), experiment_json)

        logging.info("Shutting down cluster")
        # shutdown queues and managers for "PS" executors.
        # note: we have to connect/shutdown from the spark driver, because these executors are "busy" and won't accept any other tasks.
        for node in ps_list:
            addr = node['addr']
            authkey = node['authkey']
            m = TFManager.connect(addr, authkey)
            q = m.get_queue('control')
            q.put(None)
            q.join()

        # wait for all jobs to finish
        done = False
        while not done:
            time.sleep(5)
            st = self.sc.statusTracker()
            jobs = st.getActiveJobsIds()
            if len(jobs) == 0:
                break

    def tensorboard_url(self):
        """
    Utility function to get the TensorBoard URL
    """
        tb_url = None
        for node in self.cluster_info:
            if node['tb_port'] != 0 and node['job_name'] == 'worker' \
                    and node['task_index'] == 0:
                tb_url = "http://{0}:{1}".format(node['host'], node['tb_port'])
        return tb_url
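A sketch of the typical teardown sequence, under the same assumptions as the run() sketch in Example #4:

# After run() has returned a cluster and training is under way:
url = cluster.tensorboard_url()   # chief worker's TensorBoard, if one was started
if url:
    print("TensorBoard at", url)

cluster.shutdown()                # in TENSORFLOW input mode, waits for the workers, then stops the PS nodes
# For Spark Streaming feeding jobs, pass the StreamingContext instead:
# cluster.shutdown(ssc)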
Example #6
def begin(spark,
          name='no-name',
          local_logdir=False,
          versioned_resources=None,
          description=None):
    """ Start an experiment

    Args:
      :spark: SparkSession object
      :name: (optional) name of the job
    """
    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.stop() to stop it."
        )

    try:
        global app_id
        global experiment_json
        global elastic_id
        global run_id
        global driver_tensorboard_hdfs_path

        running = True

        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        run_id = run_id + 1

        versioned_path = util.version_resources(versioned_resources,
                                                get_logdir(app_id))

        experiment_json = None

        experiment_json = util.populate_experiment(sc, name,
                                                   'experiment', 'begin',
                                                   get_logdir(app_id), None,
                                                   versioned_path, description)

        util.version_resources(versioned_resources, get_logdir(app_id))

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'begin')

        pydoop.hdfs.dump('',
                         os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())

        hopshdfs.init_logger()

        driver_tensorboard_hdfs_path, _ = tensorboard.register(
            hdfs_exec_logdir,
            hdfs_appid_logdir,
            0,
            local_logdir=local_logdir,
            tensorboard_driver=True)
    except:
        exception_handler()
        raise

    return
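begin() creates the per-run HDFS log directory and registers it with the TensorBoard helper; the run() docstring in Example #4 notes that this directory is returned by tensorboard.logdir. A short sketch, with the import paths assumed:

from hops import experiment, tensorboard   # assumed import paths

experiment.begin(spark, name='lr_sweep', local_logdir=True)
logdir = tensorboard.logdir()   # write TF summaries here so they show up in TensorBoard
# ... training loop ...
experiment.end()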
Example #7
def horovod(spark,
            notebook,
            name='no-name',
            local_logdir=False,
            versioned_resources=None,
            description=None):
    """ Run the notebooks specified in the path as input to horovod

    Args:
      :spark: SparkSession object
      :notebook: Notebook path
      :name: (optional) name of the job
    """
    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it."
        )

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        allreduce.run_id = allreduce.run_id + 1

        versioned_path = util.version_resources(versioned_resources,
                                                allreduce.get_logdir(app_id))

        experiment_json = None
        experiment_json = util.populate_experiment(
            sc, name, 'experiment', 'horovod', allreduce.get_logdir(app_id),
            None, versioned_path, description)

        util.version_resources(versioned_resources,
                               allreduce.get_logdir(app_id))

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)

        tensorboard_logdir = allreduce.launch(sc,
                                              notebook,
                                              local_logdir=local_logdir)

        experiment_json = util.finalize_experiment(experiment_json, None, None)

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)

    except:
        exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False

    return tensorboard_logdir
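A call sketch for horovod(); the import path is assumed and the notebook path is hypothetical:

from hops import experiment   # assumed import path

logdir = experiment.horovod(spark,
                            notebook='Jupyter/train_horovod.ipynb',   # hypothetical path
                            name='horovod_resnet',
                            local_logdir=True)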
Example #8
def grid_search(spark,
                map_fun,
                args_dict,
                direction='max',
                name='no-name',
                local_logdir=False,
                versioned_resources=None,
                description=None):
    """ Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
      :spark: SparkSession object
      :map_fun: The TensorFlow function to run
      :args_dict: A dictionary containing hyperparameter values to insert as arguments for each TensorFlow job
      :direction: 'max' to maximize, 'min' to minimize
      :name: (optional) name of the job
    """
    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it."
        )

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        gs.run_id = gs.run_id + 1

        versioned_path = util.version_resources(versioned_resources,
                                                gs.get_logdir(app_id))

        experiment_json = util.populate_experiment(sc, name, 'experiment',
                                                   'grid_search',
                                                   gs.get_logdir(app_id),
                                                   json.dumps(args_dict),
                                                   versioned_path, description)

        util.version_resources(versioned_resources, gs.get_logdir(app_id))

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)

        grid_params = util.grid_params(args_dict)

        tensorboard_logdir, param, metric = gs._grid_launch(
            sc,
            map_fun,
            grid_params,
            direction=direction,
            local_logdir=local_logdir)

        experiment_json = util.finalize_experiment(experiment_json, param,
                                                   metric)

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)
    except:
        exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False

    return tensorboard_logdir
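A sketch for grid_search(): every combination of the listed values is run once and the returned metric decides the best trial. The import path is assumed, and it is an assumption that the hyperparameters are passed to the wrapper as keyword arguments named after the dictionary keys.

from hops import experiment   # assumed import path

def train_fn(learning_rate, dropout):
    # build, train and evaluate the model here
    return 0.0   # placeholder: return the metric to maximize

args_dict = {'learning_rate': [0.1, 0.01], 'dropout': [0.4, 0.6]}

logdir = experiment.grid_search(spark, train_fn, args_dict,
                                direction='max', name='lr_dropout_grid')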
Example #9
def evolutionary_search(spark,
                        objective_function,
                        search_dict,
                        direction='max',
                        generations=10,
                        population=10,
                        mutation=0.5,
                        crossover=0.7,
                        cleanup_generations=False,
                        name='no-name',
                        local_logdir=False,
                        versioned_resources=None,
                        description=None):
    """ Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
      :spark_session: SparkSession object
      :map_fun: The TensorFlow function to run
      :search_dict: (optional) A dictionary containing differential evolutionary boundaries
    """
    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it."
        )

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        diff_evo.run_id = diff_evo.run_id + 1

        versioned_path = util.version_resources(versioned_resources,
                                                diff_evo.get_logdir(app_id))

        experiment_json = None
        experiment_json = util.populate_experiment(sc, name, 'experiment',
                                                   'evolutionary_search',
                                                   diff_evo.get_logdir(app_id),
                                                   json.dumps(search_dict),
                                                   versioned_path, description)

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)

        tensorboard_logdir, best_param, best_metric = diff_evo._search(
            spark,
            objective_function,
            search_dict,
            direction=direction,
            generations=generations,
            popsize=population,
            mutation=mutation,
            crossover=crossover,
            cleanup_generations=cleanup_generations,
            local_logdir=local_logdir)

        experiment_json = util.finalize_experiment(experiment_json, best_param,
                                                   best_metric)

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)

        best_param_dict = util.convert_to_dict(best_param)

    except:
        exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False

    return tensorboard_logdir, best_param_dict
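A sketch for evolutionary_search(); the import path is assumed, and the assumption that each key in search_dict maps to a [lower, upper] boundary pair is mine, not stated in the source.

from hops import experiment   # assumed import path

def objective_fn(learning_rate, dropout):
    # train and return the metric that the search should maximize
    return 0.0   # placeholder

# Assumed format: one [lower, upper] boundary pair per hyperparameter.
search_dict = {'learning_rate': [0.001, 0.1], 'dropout': [0.1, 0.7]}

logdir, best_params = experiment.evolutionary_search(
    spark, objective_fn, search_dict,
    direction='max', generations=5, population=10, name='evo_search')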
Example #10
def launch(spark,
           map_fun,
           args_dict=None,
           name='no-name',
           local_logdir=False,
           versioned_resources=None,
           description=None):
    """ Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
      :spark: SparkSession object
      :map_fun: The TensorFlow function to run
      :args_dict: (optional) A dictionary containing hyperparameter values to insert as arguments for each TensorFlow job
      :name: (optional) name of the job
    """
    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it."
        )

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        launcher.run_id = launcher.run_id + 1

        versioned_path = util.version_resources(versioned_resources,
                                                launcher.get_logdir(app_id))

        experiment_json = None
        if args_dict:
            experiment_json = util.populate_experiment(
                sc, name,
                'experiment', 'launcher', launcher.get_logdir(app_id),
                json.dumps(args_dict), versioned_path, description)
        else:
            experiment_json = util.populate_experiment(
                sc, name, 'experiment', 'launcher',
                launcher.get_logdir(app_id), None, versioned_path, description)

        util.version_resources(versioned_resources,
                               launcher.get_logdir(app_id))

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)

        retval, tensorboard_logdir = launcher.launch(sc, map_fun, args_dict,
                                                     local_logdir)

        if retval:
            experiment_json = util.finalize_experiment(experiment_json, None,
                                                       retval)
            util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                             experiment_json)
            return tensorboard_logdir

        experiment_json = util.finalize_experiment(experiment_json, None, None)

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)

    except:
        exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False

    return tensorboard_logdir
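A sketch for launch(), which runs a single wrapper function, optionally once per argument combination in args_dict. The import path is assumed, and the keyword-argument convention is the same assumption as in the grid_search sketch above.

from hops import experiment   # assumed import path

def train_fn(learning_rate, dropout):
    # one training run; optionally return a value to record with the experiment
    return 0.0   # placeholder

logdir = experiment.launch(spark, train_fn,
                           args_dict={'learning_rate': [0.01], 'dropout': [0.5]},
                           name='single_run')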