Example 1
def parallax_run_mpi(single_gpu_meta_graph_def,
                     run,
                     config,
                     is_test,
                     export_graph=True):

    mpi_meta_graph_def, tensor_or_op_name_to_replica_names = \
        graph_transform_mpi(single_gpu_meta_graph_def, config)
    worker_id = hvd.rank()
    num_workers = hvd.size()

    with tf.Graph().as_default() as graph_to_run:
        parallax_log.debug("Importing MPI graph on worker %d" % worker_id)
        tf.train.import_meta_graph(mpi_meta_graph_def)
        if export_graph:
            export_mpi_meta_graph(worker_id)

        ckpt_hooks = build_ckpt_hooks(
            config.get_ckpt_config(is_test)) if worker_id == 0 else None

        sess_config = config.sess_config
        if sess_config is None:
            sess_config = tf.ConfigProto(allow_soft_placement=True)
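        # Pin this process to one local GPU: hvd.local_rank() is this
        # worker's index among the Horovod processes on this machine.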
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
        with tf.train.MonitoredTrainingSession(
                is_chief=True,
                checkpoint_dir=config.get_ckpt_config(is_test).ckpt_dir
                if worker_id == 0 else None,
                # TODO: Allow user-defined hooks
                hooks=None,
                chief_only_hooks=ckpt_hooks,
                save_checkpoint_secs=None,
                save_summaries_steps=None,
                save_summaries_secs=None,
                config=sess_config) as sess:
            parallax_log.debug(
                "Created MonitoredTrainingSession for worker %d" % worker_id)
            _init_global_vars(sess)
            parallax_log.debug(
                "Finished initialization process, start training on worker %d"
                % worker_id)

            if is_test:
                parallax_log.debug('warmup started')
                run(sess, NUM_ITERATIONS_FOR_WARMUP,
                    tensor_or_op_name_to_replica_names, num_workers, worker_id,
                    1)
                parallax_log.debug('warmup finished')

            start_time = time.time()
            run(sess, config.num_iterations(is_test),
                tensor_or_op_name_to_replica_names, num_workers, worker_id, 1)
            end_time = time.time()

            if is_test:
                send_execution_time(config.resource_info['master'][0],
                                    worker_id, end_time - start_time)
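
The run argument is the user-supplied training loop; its expected signature can be read off the two call sites above. A minimal sketch, assuming tensor_or_op_name_to_replica_names maps each name in the original single-GPU graph to its per-replica names, and that the graph defines a train op under the hypothetical name 'train_op':

def run(sess, num_iterations, tensor_or_op_name_to_replica_names,
        num_workers, worker_id, num_replicas_per_worker):
    # Look up the transformed per-replica names of the hypothetical
    # 'train_op'; tf.Session.run accepts op names as string fetches.
    replica_train_ops = tensor_or_op_name_to_replica_names['train_op']
    for _ in range(num_iterations):
        sess.run(replica_train_ops)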
Example 2
def parallax_run_mpi(single_gpu_meta_graph_def, config, export_graph=True):

    mpi_meta_graph_def, tensor_or_op_name_to_replica_names = \
        graph_transform_mpi(single_gpu_meta_graph_def, config)
    worker_id = hvd.rank()
    num_workers = hvd.size()

    with tf.Graph().as_default() as graph_to_run:
        parallax_log.debug("Importing MPI graph on worker %d" % worker_id)
        tf.train.import_meta_graph(mpi_meta_graph_def)
        if export_graph:
            export_mpi_meta_graph(worker_id)

        ckpt_hooks = build_ckpt_hooks(
            config.get_ckpt_config()) if worker_id == 0 else None

        sess_config = config.sess_config
        if sess_config is None:
            sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
        sess = tf.train.MonitoredTrainingSession(
            is_chief=True,
            checkpoint_dir=config.get_ckpt_config().ckpt_dir
            if worker_id == 0 else None,
            # TODO: Allow user-defined hooks
            hooks=None,
            chief_only_hooks=ckpt_hooks,
            save_checkpoint_secs=None,
            save_summaries_steps=None,
            save_summaries_secs=None,
            config=sess_config)

        parallax_log.debug("Created MonitoredTrainingSession for worker %d" %
                           worker_id)
        _init_global_vars(sess)
        parallax_log.debug(
            "Finished initialization process, start training on \
             worker %d" % worker_id)
        step = sess.run(tf.get_collection(tf.GraphKeys.GLOBAL_STEP)[0])
        sess_context = \
            ParallaxSessionContext(step,
                                   config.profile_config.profile_dir,
                                   config.profile_config.profile_steps,
                                   tensor_or_op_name_to_replica_names,
                                   1)
        sess_context.set_parallax_session_context()
        return sess, num_workers, worker_id, 1
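
Unlike Example 1, this variant returns the MonitoredTrainingSession and lets the caller drive the loop. A minimal caller-side sketch, assuming hvd.init() has already run and that single_gpu_meta_graph_def and config come from the surrounding Parallax setup (the iteration count and op name below are illustrative):

sess, num_workers, worker_id, num_replicas = parallax_run_mpi(
    single_gpu_meta_graph_def, config)
try:
    for _ in range(1000):
        sess.run('train_op')  # hypothetical replica-mapped op name
finally:
    sess.close()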
Example 3
def parallax_run_mpi(single_gpu_meta_graph_def, config):
    hostname = os.getenv(PARALLAX_HOSTNAME, 0)
    create_profile_directory(config.profile_config.profile_dir,
                             config.profile_config.profile_worker,
                             config.resource_info, hostname)

    mpi_meta_graph_def, tensor_or_op_name_to_replica_names = \
        graph_transform_mpi(single_gpu_meta_graph_def, config)
    worker_id = hvd.rank()
    num_workers = hvd.size()

    if config.profile_config.profile_dir:
        append_task_info(config.profile_config.profile_dir, hostname,
                         ['worker:%d' % worker_id])

    with tf.Graph().as_default() as graph_to_run:
        parallax_log.debug("Importing MPI graph on worker %d" % worker_id)
        tf.train.import_meta_graph(mpi_meta_graph_def)

        if config.export_graph_path:
            export_meta_graph(config.export_graph_path, worker_id)

        if config.profile_config.profile_dir:
            path = os.path.join(config.profile_config.profile_dir, hostname,
                                'worker:%d' % worker_id)
            export_meta_graph(path, worker_id)

            if worker_id != config.profile_config.profile_worker:
                # Only one CUPTI profiler can run on a machine.
                # See tensorflow/tensorflow/core/platform/default/device_tracer.cc:L452
                config.profile_config.profile_dir = None
            else:
                config.profile_config.profile_dir = \
                    os.path.join(config.profile_config.profile_dir, hostname,
                                 'worker:%d' % worker_id, 'run_meta')

        ckpt_hooks = build_ckpt_hooks(
            config.get_ckpt_config()) if worker_id == 0 else None

        sess_config = config.sess_config
        if sess_config is None:
            sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
        sess = tf.train.MonitoredTrainingSession(
            is_chief=True,
            checkpoint_dir=config.get_ckpt_config().ckpt_dir
            if worker_id == 0 else None,
            # TODO: Allow user-defined hooks
            hooks=None,
            chief_only_hooks=ckpt_hooks,
            save_checkpoint_secs=None,
            save_summaries_steps=None,
            save_summaries_secs=None,
            config=sess_config)

        parallax_log.debug("Created MonitoredTrainingSession for worker %d" %
                           worker_id)
        _init_global_vars(sess)
        parallax_log.debug(
            "Finished initialization process, start training on \
             worker %d" % worker_id)
        step = sess.run(tf.get_collection(tf.GraphKeys.GLOBAL_STEP)[0])
        sess_context = \
            ParallaxSessionContext(step,
                                   config.profile_config.profile_dir,
                                   config.profile_config.profile_steps,
                                   config.profile_config.profile_range,
                                   tensor_or_op_name_to_replica_names,
                                   1,
                                   config.resource_info['master'][0])
        sess_context.set_parallax_session_context()
        return sess, num_workers, worker_id, 1
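
All three variants presuppose an MPI launch with Horovod already initialized, since hvd.rank(), hvd.size(), and hvd.local_rank() are queried unconditionally. A minimal caller-side preamble, with the launch command in the comment purely illustrative:

import horovod.tensorflow as hvd

# Typically launched as: mpirun -np <num_processes> python <script>.py
hvd.init()  # must precede any hvd.rank()/hvd.size()/hvd.local_rank() call
sess, num_workers, worker_id, num_replicas = parallax_run_mpi(
    single_gpu_meta_graph_def, config)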