import contextlib

from cupy.cuda import nvtx


@contextlib.contextmanager
def time_range(message, color_id=None, argb_color=None):
    """A context manager to describe the enclosed block as a nested range

    >>> from cupy import prof
    >>> with cupy.prof.time_range('some range in green', color_id=0):
    ...     # do something you want to measure
    ...     pass

    Args:
        message: Name of a range.
        color_id: range color ID
        argb_color: range color in ARGB (e.g. 0xFF00FF00 for green)

    .. seealso:: :func:`cupy.cuda.nvtx.RangePush`
        :func:`cupy.cuda.nvtx.RangePop`
    """
    if color_id is not None and argb_color is not None:
        raise ValueError(
            'Only either color_id or argb_color can be specified')
    if argb_color is not None:
        nvtx.RangePushC(message, argb_color)
    else:
        if color_id is None:
            color_id = -1
        nvtx.RangePush(message, color_id)
    try:
        yield
    finally:
        nvtx.RangePop()
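# Usage sketch for the ARGB variant of time_range; per the docstring above,
# 0xFF00FF00 encodes opaque green. The enclosing function is illustrative,
# not part of the original module.
def _demo_argb_time_range():
    with time_range('some range in green', argb_color=0xFF00FF00):
        pass  # work to measure inside the NVTX range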
def read(self, datafile, labelfile):
    nvtx.RangePush("Read Data", 2)

    #set shape to none in the beginning
    shape = None

    #read the data file
    with h5.File(self.path + '/' + datafile, "r", driver="core",
                 backing_store=False) as f:
        #get shape info
        shape = f['climate']['data'].shape

        #get min and max values and update stored values
        if self.update_on_read:
            self.minvals = np.minimum(
                self.minvals, f['climate']['data_stats'][0, self.channels])
            self.maxvals = np.maximum(
                self.maxvals, f['climate']['data_stats'][1, self.channels])

        #get data
        data = f['climate']['data'][:, :, self.channels].astype(np.float32)

        #do min/max normalization
        for c in range(len(self.channels)):
            data[:, :, c] = (data[:, :, c] - self.minvals[c]) / (
                self.maxvals[c] - self.minvals[c])

        #transposition necessary because we want NCHW
        data = np.transpose(data, [2, 0, 1])

    #read the label file
    with h5.File(self.path + '/' + labelfile, "r", driver="core",
                 backing_store=False) as f:
        label = f['climate']['labels'][...].astype(np.int32)

    nvtx.RangePop()  # Read Data
    return data, label
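# Usage sketch for read(); the constructor arguments mirror the
# h5_input_reader call sites in main() below, and the HDF5 file names are
# placeholders, not files shipped with this code.
def _demo_h5_read(path):
    reader = h5_input_reader(path, channels=[0, 1, 2, 10],
                             update_on_read=True)
    #data comes back as float32 in (C, H, W) order, label as int32 (H, W)
    data, label = reader.read('data-sample.h5', 'label-sample.h5')
    return data.shape, label.shape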
def main(blocks, weights, image_dir, checkpoint_dir, trn_sz):
    #init horovod
    nvtx.RangePush("init horovod", 1)
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks"
                  .format(comm_size))
    nvtx.RangePop()  # init horovod

    #parameters
    batch = 1
    channels = [0, 1, 2, 10]
    num_epochs = 3
    dtype = tf.float16

    #session config
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=2,
        intra_op_parallelism_threads=33,
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)

    #get data
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    path, trn_data, trn_labels, val_data, val_labels, tst_data, tst_labels = \
        load_data(trn_sz)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("done.")

    with training_graph.as_default():
        nvtx.RangePush("TF Init", 3)
        #create datasets
        datafiles = tf.placeholder(tf.string, shape=[None])
        labelfiles = tf.placeholder(tf.string, shape=[None])
        trn_reader = h5_input_reader(path, channels, update_on_read=True)
        trn_dataset = create_dataset(trn_reader, datafiles, labelfiles, batch,
                                     num_epochs, comm_size, comm_rank, True)
        val_reader = h5_input_reader(path, channels, update_on_read=False)
        val_dataset = create_dataset(val_reader, datafiles, labelfiles, batch,
                                     1, comm_size, comm_rank)

        #create iterators
        handle = tf.placeholder(tf.string, shape=[],
                                name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (tf.float32, tf.int32),
            ((batch, len(channels), image_height, image_width),
             (batch, image_height, image_width)))
        next_elem = iterator.get_next()

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #set up model
        logit, prediction = create_tiramisu(3, next_elem[0], image_height,
                                            image_width, len(channels),
                                            loss_weights=weights,
                                            nb_layers_per_block=blocks,
                                            p=0.2, wd=1e-4, dtype=dtype)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=next_elem[1],
                                                      logits=logit)
        global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        opt = tf.train.RMSPropOptimizer(learning_rate=1e-3)
        if horovod:
            opt = hvd.DistributedOptimizer(opt)
        train_op = opt.minimize(loss, global_step=global_step)

        #set up streaming metrics
        labels_one_hot = tf.contrib.layers.one_hot_encoding(next_elem[1], 3)
        iou_op, iou_update_op = tf.metrics.mean_iou(prediction,
                                                    labels_one_hot, 3,
                                                    weights=None,
                                                    metrics_collections=None,
                                                    updates_collections=None,
                                                    name="iou_score")

        #compute epochs and stuff:
        num_samples = trn_data.shape[0] // comm_size
        num_steps_per_epoch = num_samples // batch
        num_steps = num_epochs * num_steps_per_epoch

        #hooks
        #these hooks are essential:
        #regularize the step hook by adding one additional step at the end
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        if horovod:
            hooks.append(hvd.BroadcastGlobalVariablesHook(0))

        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #checkpointing and summary writing are disabled in this variant

        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])
            #create iterator handles
            trn_handle, val_handle = sess.run(
                [trn_handle_string, val_handle_string])
            #init iterators
            sess.run(trn_init_op,
                     feed_dict={
                         handle: trn_handle,
                         datafiles: trn_data,
                         labelfiles: trn_labels
                     })
            sess.run(val_init_op,
                     feed_dict={
                         handle: val_handle,
                         datafiles: val_data,
                         labelfiles: val_labels
                     })
            nvtx.RangePop()  # TF Init

            #do the training
            epoch = 1
            step = 1
            train_loss = 0.
            nvtx.RangePush("Training Loop", 4)
            nvtx.RangePush("Epoch", epoch)
            start_time = time.time()
            while not sess.should_stop():
                #training loop
                try:
                    nvtx.RangePush("Step", step)
                    #run the training step and update the streaming metrics
                    _, _, train_steps, tmp_loss = sess.run(
                        [train_op, iou_update_op, global_step, loss],
                        feed_dict={handle: trn_handle})
                    train_steps_in_epoch = train_steps % num_steps_per_epoch
                    train_loss += tmp_loss
                    nvtx.RangePop()  # Step
                    step += 1
                    if train_steps_in_epoch > 0:
                        #print step report
                        print("REPORT: rank {}, training loss for step {} (of {}) is {}".format(
                            comm_rank, train_steps, num_steps,
                            train_loss / train_steps_in_epoch))
                    else:
                        end_time = time.time()
                        #print epoch report
                        train_loss /= num_steps_per_epoch
                        print("COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, epoch duration {} s".format(
                            comm_rank, epoch, num_epochs, train_loss,
                            end_time - start_time))
                        nvtx.RangePush("IOU", 6)
                        iou_score = sess.run(iou_op)
                        nvtx.RangePop()  # IOU
                        print("COMPLETED: rank {}, training IoU for epoch {} (of {}) is {}, epoch duration {} s".format(
                            comm_rank, epoch, num_epochs, iou_score,
                            end_time - start_time))
                        start_time = time.time()

                        #evaluation loop
                        eval_loss = 0.
                        eval_steps = 0
                        #hand the training normalization stats to the
                        #validation reader
                        val_reader.minvals = trn_reader.minvals
                        val_reader.maxvals = trn_reader.maxvals
                        nvtx.RangePush("Eval Loop", 7)
                        while True:
                            try:
                                #run a validation step
                                _, tmp_loss, val_model_predictions, val_model_labels = sess.run(
                                    [iou_update_op, loss, prediction,
                                     next_elem[1]],
                                    feed_dict={handle: val_handle})
                                if use_scipy:
                                    imsave(image_dir + '/test_pred_epoch' + str(epoch) +
                                           '_estep' + str(eval_steps) + '_rank' +
                                           str(comm_rank) + '.png',
                                           np.argmax(val_model_predictions[0, ...], axis=2) * 100)
                                    imsave(image_dir + '/test_label_epoch' + str(epoch) +
                                           '_estep' + str(eval_steps) + '_rank' +
                                           str(comm_rank) + '.png',
                                           val_model_labels[0, ...] * 100)
                                else:
                                    np.save(image_dir + '/test_pred_epoch' + str(epoch) +
                                            '_estep' + str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.npy',
                                            np.argmax(val_model_predictions[0, ...], axis=2) * 100)
                                    np.save(image_dir + '/test_label_epoch' + str(epoch) +
                                            '_estep' + str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.npy',
                                            val_model_labels[0, ...] * 100)
                                eval_loss += tmp_loss
                                eval_steps += 1
                            except tf.errors.OutOfRangeError:
                                eval_steps = np.max([eval_steps, 1])
                                eval_loss /= eval_steps
                                print("COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}".format(
                                    comm_rank, epoch - 1, num_epochs, eval_loss))
                                iou_score = sess.run(iou_op)
                                print("COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}".format(
                                    comm_rank, epoch - 1, num_epochs, iou_score))
                                sess.run(val_init_op,
                                         feed_dict={
                                             handle: val_handle,
                                             datafiles: val_data,
                                             labelfiles: val_labels
                                         })
                                break
                        nvtx.RangePop()  # Eval Loop

                        #reset counters
                        epoch += 1
                        train_loss = 0.
                        step = 0

                        nvtx.RangePop()  # Epoch
                        nvtx.RangePush("Epoch", epoch)

                except tf.errors.OutOfRangeError:
                    break

            nvtx.RangePop()  # Epoch
            nvtx.RangePop()  # Training Loop
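# create_dataset is not defined in this snippet; the sketch below is one
# plausible tf.data implementation consistent with the call sites above
# (reader, filename placeholder tensors, batch size, epoch count, shard
# count and index, optional shuffle flag). Wrapping reader.read in
# tf.py_func is an assumption, not the shipped implementation.
def create_dataset_sketch(reader, datafiles, labelfiles, batch, num_epochs,
                          num_shards, shard_index, shuffle=False):
    dataset = tf.data.Dataset.from_tensor_slices((datafiles, labelfiles))
    #give every rank its own disjoint slice of the file list
    dataset = dataset.shard(num_shards, shard_index)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=100)
    #read one (data, label) sample per file pair via the HDF5 reader
    dataset = dataset.map(
        lambda df, lf: tuple(tf.py_func(reader.read, [df, lf],
                                        [tf.float32, tf.int32])),
        num_parallel_calls=4)
    dataset = dataset.batch(batch)
    dataset = dataset.repeat(num_epochs)
    return dataset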
def __enter__(self):
    if self.argb_color is not None:
        nvtx.RangePushC(self.message, self.argb_color)
    else:
        nvtx.RangePush(self.message, self.color_id)
    return self
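# The matching __exit__ is not shown in this snippet; since __enter__ pushes
# exactly one NVTX range, a minimal sketch simply pops it again, regardless
# of whether the block raised:
def __exit__(self, exc_type, exc_value, traceback):
    nvtx.RangePop()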
def test_RangePush(self):
    nvtx.RangePush("test:RangePush", 1)
    nvtx.RangePop()
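# A companion check for the ARGB variant; RangePushC takes an explicit
# ARGB color (here opaque green) instead of a color ID.
def test_RangePushC(self):
    nvtx.RangePushC("test:RangePushC", 0xFF00FF00)
    nvtx.RangePop()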
def main(input_path, blocks, weights, image_dir, checkpoint_dir, trn_sz,
         learning_rate, loss_type, fs_type, opt_type, batch, batchnorm,
         num_epochs, dtype, chkpt, filter_sz, growth, disable_training,
         enable_tf_timeline):
    options = None
    run_metadata = None
    many_runs_timeline = None
    timeline_trace_fp = open("timeline_trace.pickle", "wb")

    options, run_metadata, many_runs_timeline, min_timeline_step, max_timeline_step = \
        init_timeline_configs(enable_tf_timeline, tf.RunOptions.FULL_TRACE,
                              -1, -1)

    global_time_logger = logger(-1, "Global Total Time", -1, True)
    global_time_logger.start_timer()

    #init horovod
    initialization_timer_logger = logger(-1, "Initialize Horovod", -1, True)
    initialization_timer_logger.start_timer()
    nvtx.RangePush("init horovod", 1)
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        #not all horovod versions have local_size implemented
        try:
            comm_local_size = hvd.local_size()
        except Exception:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks"
                  .format(comm_size))
    nvtx.RangePop()  # init horovod
    initialization_timer_logger.set_rank(int(comm_rank))
    initialization_timer_logger.end_timer()

    global_time_logger.set_rank(int(comm_rank))

    #parameters
    channels = [0, 1, 2, 10]
    per_rank_output = False
    loss_print_interval = 1

    #session config
    initialization_timer_logger.start_timer(comm_rank, "Configure Session")
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=6,
        intra_op_parallelism_threads=1,
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    initialization_timer_logger.end_timer()

    #get data
    initialization_timer_logger.start_timer(comm_rank, "Get Data")
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    trn_data, val_data, tst_data = load_data(input_path, trn_sz, comm_rank)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("done.")
    initialization_timer_logger.end_timer()

    #print some stats
    if comm_rank == 0:
        print("Learning Rate: {}".format(learning_rate))
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Batch normalization: {}".format(batchnorm))
        print("Blocks: {}".format(blocks))
        print("Growth rate: {}".format(growth))
        print("Filter size: {}".format(filter_sz))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Optimizer type: {}".format(opt_type))
        print("Num training samples: {}".format(trn_data.shape[0]))
        print("Num validation samples: {}".format(val_data.shape[0]))

    io_training_time_logger = logger(comm_rank, "IO and Training", -1, True)
    io_training_time_logger.start_timer()

    with training_graph.as_default():
        nvtx.RangePush("TF Init", 3)
        #create readers
        trn_reader = h5_input_reader(input_path, channels, weights, dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     comm_rank=comm_rank)
        val_reader = h5_input_reader(input_path, channels, weights, dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     comm_rank=comm_rank)
        #create datasets
        if fs_type == "local":
            trn_dataset = create_dataset(trn_reader, trn_data, batch,
                                         num_epochs, comm_local_size,
                                         comm_local_rank, dtype, shuffle=True)
            val_dataset = create_dataset(val_reader, val_data, batch, 1,
                                         comm_local_size, comm_local_rank,
                                         dtype, shuffle=False)
        else:
            trn_dataset = create_dataset(trn_reader, trn_data, batch,
                                         num_epochs, comm_size, comm_rank,
                                         dtype, shuffle=True)
            val_dataset = create_dataset(val_reader, val_data, batch, 1,
                                         comm_size, comm_rank, dtype,
                                         shuffle=False)

        #create iterators
        handle = tf.placeholder(tf.string, shape=[],
                                name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype),
            ((batch, len(channels), image_height, image_width),
             (batch, image_height, image_width),
             (batch, image_height, image_width)))
        next_elem = iterator.get_next()

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #set up model
        logit, prediction = create_tiramisu(3, next_elem[0], image_height,
                                            image_width, len(channels),
                                            loss_weights=weights,
                                            nb_layers_per_block=blocks,
                                            p=0.2, wd=1e-4, dtype=dtype,
                                            batchnorm=batchnorm,
                                            growth_rate=growth,
                                            filter_sz=filter_sz,
                                            comm_rank=comm_rank)

        #set up loss
        labels_one_hot = tf.cast(
            tf.contrib.layers.one_hot_encoding(next_elem[1], 3), dtype=dtype)
        loss = None
        if loss_type == "weighted":
            loss = tf.losses.softmax_cross_entropy(
                onehot_labels=labels_one_hot, logits=logit,
                weights=next_elem[2])
        elif loss_type == "focal":
            loss = focal_loss(onehot_labels=labels_one_hot, logits=logit,
                              alpha=1., gamma=2.)
        else:
            raise ValueError(
                "Error, loss type {} not supported.".format(loss_type))
        if horovod:
            loss_avg = hvd.allreduce(tf.cast(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)

        #set up global step
        global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        if opt_type.startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op = get_larc_optimizer(opt_type.split("-")[1], loss,
                                          global_step, learning_rate,
                                          LARC_mode="clip", LARC_eta=0.002,
                                          LARC_epsilon=1. / 16000.)
        else:
            train_op = get_optimizer(opt_type, loss, global_step,
                                     learning_rate)

        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(
            labels=next_elem[1],
            predictions=tf.argmax(prediction, axis=3),
            num_classes=3,
            weights=None,
            metrics_collections=None,
            updates_collections=None,
            name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables()
            if i.name.startswith('iou_score/')
        ])
        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        #compute epochs and stuff:
        if fs_type == "local":
            num_samples = trn_data.shape[0] // comm_local_size
        else:
            num_samples = trn_data.shape[0] // comm_size
        #note: num_steps_per_epoch is hard-coded to a small value here;
        #the data-derived value would be num_samples // batch
        num_steps_per_epoch = 10
        num_steps = num_epochs * num_steps_per_epoch
        if per_rank_output:
            print("Rank {} does {} steps per epoch".format(
                comm_rank, num_steps_per_epoch))

        #hooks
        #these hooks are essential:
        #regularize the step hook by adding one additional step at the end
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        #bcast init for bcasting the model after start
        init_bcast = hvd.broadcast_global_variables(0)
        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = num_steps_per_epoch * 2
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            listener = checkpoint_listener(comm_rank, True)
            hooks.append(
                tf.train.CheckpointSaverHook(checkpoint_dir=checkpoint_dir,
                                             save_steps=checkpoint_save_freq,
                                             saver=checkpoint_saver,
                                             listeners=[listener]))
            #create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])
            #restore from checkpoint:
            if comm_rank == 0:
                load_model(sess, checkpoint_saver, checkpoint_dir, comm_rank)
            #broadcast loaded model variables
            sess.run(init_bcast)
            #create iterator handles
            trn_handle, val_handle = sess.run(
                [trn_handle_string, val_handle_string],
                options=options, run_metadata=run_metadata)
            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "create_iterator_handle.json")
            #init iterators
            sess.run(trn_init_op, feed_dict={handle: trn_handle},
                     options=options, run_metadata=run_metadata)
            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "init_train_iterator_handle.json")
            sess.run(val_init_op, feed_dict={handle: val_handle},
                     options=options, run_metadata=run_metadata)
            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "init_val_iterator_handle.json")
            nvtx.RangePop()  # TF Init

            #do the training
            epoch = 1
            step = 1
            train_loss = 0.
nvtx.RangePush("Training Loop", 4) nvtx.RangePush("Epoch", epoch) start_time = time.time() training_loop_timer_logger = logger(comm_rank, "Training Loop", -1, True) training_loop_timer_logger.start_timer() train_steps = 0 while not (sess.should_stop()): #training loop try: training_iteration_time_logger = logger( comm_rank, "Training Iteration", epoch, True) training_iteration_time_logger.start_timer() nvtx.RangePush("Step", step) if disable_training: train_steps = sess.run([global_step], feed_dict={handle: trn_handle}, options=options, run_metadata=run_metadata) update_timeline_in_range( enable_tf_timeline, run_metadata, many_runs_timeline, train_steps[0], "train_" + str(global_step) + ".json", min_timeline_step, max_timeline_step) train_steps_in_epoch = train_steps[ 0] % num_steps_per_epoch # do the validation phase if train_steps_in_epoch == 0: eval_steps = 0 while True: try: sess.run([next_elem[1]], feed_dict={handle: val_handle}, options=options, run_metadata=run_metadata) update_timeline_in_range( enable_tf_timeline, run_metadata, many_runs_timeline, "val_dict" + str(eval_steps) + ".json") eval_steps += 1 except tf.errors.OutOfRangeError: sess.run(val_init_op, feed_dict={handle: val_handle}, options=options, run_metadata=run_metadata) update_timeline_in_range( enable_tf_timeline, run_metadata, many_runs_timeline, "val_dict_out_" + str(eval_steps) + ".json") break else: # construct feed dict _, train_steps, tmp_loss = sess.run( [ train_op, global_step, (loss if per_rank_output else loss_avg) ], feed_dict={handle: trn_handle}, options=options, run_metadata=run_metadata) update_timeline_in_range( enable_tf_timeline, run_metadata, many_runs_timeline, train_steps, "val_" + str(global_step) + ".json", min_timeline_step, max_timeline_step) if comm_rank == 0: step_trace_fp = open( "train_step_trace_" + str(global_step) + ".pickle", "wb") pickle.dump(run_metadata, step_trace_fp) train_steps_in_epoch = train_steps % num_steps_per_epoch train_loss += tmp_loss nvtx.RangePop() # Step step += 1 #print step report eff_steps = train_steps_in_epoch if ( train_steps_in_epoch > 0) else num_steps_per_epoch if (train_steps % loss_print_interval) == 0: if per_rank_output: print( "REPORT: rank {}, training loss for step {} (of {}) is {}, time {}" .format(comm_rank, train_steps, num_steps, train_loss / eff_steps, time.time() - start_time)) else: if comm_rank == 0: print( "REPORT: training loss for step {} (of {}) is {}, time {}" .format(train_steps, num_steps, train_loss / eff_steps, time.time() - start_time)) #do the validation phase if train_steps_in_epoch == 0: end_time = time.time() #print epoch report train_loss /= num_steps_per_epoch if per_rank_output: print( "COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {} s" .format(comm_rank, epoch, num_epochs, train_loss, time.time() - start_time)) else: if comm_rank == 0: print( "COMPLETED: training loss for epoch {} (of {}) is {}, time {} s" .format(epoch, num_epochs, train_loss, time.time() - start_time)) #evaluation loop eval_loss = 0. 
                        eval_steps = 0
                        nvtx.RangePush("Eval Loop", 7)
                        timeline_help_count = 0
                        while True:
                            try:
                                #run a validation step
                                _, tmp_loss, val_model_predictions, val_model_labels = sess.run(
                                    [iou_update_op,
                                     (loss if per_rank_output else loss_avg),
                                     prediction, next_elem[1]],
                                    feed_dict={handle: val_handle},
                                    options=options,
                                    run_metadata=run_metadata)
                                update_timeline_in_range(
                                    enable_tf_timeline, run_metadata,
                                    many_runs_timeline, timeline_help_count,
                                    "train_" + str(timeline_help_count) + ".json",
                                    min_timeline_step, max_timeline_step)
                                if comm_rank == 0:
                                    with open("validation_step_trace_" +
                                              str(timeline_help_count) +
                                              ".pickle", "wb") as step_trace_fp:
                                        pickle.dump(run_metadata,
                                                    step_trace_fp)
                                timeline_help_count += 1

                                #print some images
                                if comm_rank == 0:
                                    if have_imsave:
                                        imsave(image_dir + '/test_pred_epoch' + str(epoch) +
                                               '_estep' + str(eval_steps) + '_rank' +
                                               str(comm_rank) + '.png',
                                               np.argmax(val_model_predictions[0, ...], axis=2) * 100)
                                        imsave(image_dir + '/test_label_epoch' + str(epoch) +
                                               '_estep' + str(eval_steps) + '_rank' +
                                               str(comm_rank) + '.png',
                                               val_model_labels[0, ...] * 100)
                                        imsave(image_dir + '/test_combined_epoch' + str(epoch) +
                                               '_estep' + str(eval_steps) + '_rank' +
                                               str(comm_rank) + '.png',
                                               colormap[val_model_labels[0, ...],
                                                        np.argmax(val_model_predictions[0, ...], axis=2)])
                                    else:
                                        np.save(image_dir + '/test_pred_epoch' + str(epoch) +
                                                '_estep' + str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.npy',
                                                np.argmax(val_model_predictions[0, ...], axis=2) * 100)
                                        np.save(image_dir + '/test_label_epoch' + str(epoch) +
                                                '_estep' + str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.npy',
                                                val_model_labels[0, ...] * 100)
                                eval_loss += tmp_loss
                                eval_steps += 1
                            except tf.errors.OutOfRangeError:
                                eval_steps = np.max([eval_steps, 1])
                                eval_loss /= eval_steps
                                if per_rank_output:
                                    print("COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}".format(
                                        comm_rank, epoch, num_epochs,
                                        eval_loss))
                                else:
                                    if comm_rank == 0:
                                        print("COMPLETED: evaluation loss for epoch {} (of {}) is {}".format(
                                            epoch, num_epochs, eval_loss))
                                if per_rank_output:
                                    iou_score = sess.run(iou_op)
                                    print("COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}".format(
                                        comm_rank, epoch, num_epochs,
                                        iou_score))
                                else:
                                    iou_score = sess.run(iou_avg)
                                    if comm_rank == 0:
                                        print("COMPLETED: evaluation IoU for epoch {} (of {}) is {}".format(
                                            epoch, num_epochs, iou_score))
                                sess.run(iou_reset_op)
                                sess.run(val_init_op,
                                         feed_dict={handle: val_handle},
                                         options=options,
                                         run_metadata=run_metadata)
                                update_timeline_in_range(
                                    enable_tf_timeline, run_metadata,
                                    many_runs_timeline,
                                    "train_" + str(train_steps) + ".json")
                                if comm_rank == 0:
                                    with open("validation_step_trace_out.pickle",
                                              "wb") as step_trace_fp:
                                        pickle.dump(run_metadata,
                                                    step_trace_fp)
                                break
                        nvtx.RangePop()  # Eval Loop

                        if enable_tf_timeline:
                            many_runs_timeline.save('Timeliner_output.json')

                        #reset counters
                        epoch += 1
                        train_loss = 0.
                        step = 0

                        nvtx.RangePop()  # Epoch
                        nvtx.RangePush("Epoch", epoch)

                    training_iteration_time_logger.end_timer()

                except tf.errors.OutOfRangeError:
                    break

            nvtx.RangePop()  # Epoch
            nvtx.RangePop()  # Training Loop
            training_loop_timer_logger.end_timer()

    if enable_tf_timeline:
        many_runs_timeline.save('Timeliner_output.json')

    io_training_time_logger.end_timer()
    global_time_logger.end_timer()
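# The logger timing helper used throughout main() is defined elsewhere; the
# sketch below is an assumed interface reconstructed from the call sites
# above (rank, name, epoch, active flag; start_timer optionally retargeting
# rank and name; set_rank; end_timer printing the elapsed time). It is not
# the shipped implementation.
import time


class logger_sketch(object):
    def __init__(self, rank, name, epoch, active):
        self.rank = rank
        self.name = name
        self.epoch = epoch
        self.active = active
        self._start = None

    def set_rank(self, rank):
        self.rank = rank

    def start_timer(self, rank=None, name=None):
        if rank is not None:
            self.rank = rank
        if name is not None:
            self.name = name
        self._start = time.time()

    def end_timer(self):
        if self.active and self._start is not None:
            print("TIMER: rank {}, {} (epoch {}): {:.3f} s".format(
                self.rank, self.name, self.epoch,
                time.time() - self._start))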