Example #1
    def _eval(self):
        logdir = args.logdir
        if cfg.TRAINER == 'replicated':
            all_results = multithread_eval_coco(self.dataflows, self.predictors)
        else:
            filenames = [os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
            ) for rank in range(hvd.local_size())]

            if self._horovod_run_eval:
                local_results = eval_coco(self.dataflow, self.predictor)
                fname = filenames[hvd.local_rank()]
                with open(fname, 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for fname in filenames:
                with open(fname, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
                os.unlink(fname)

        output_file = os.path.join(
            logdir, 'outputs{}.json'.format(self.global_step))
        with open(output_file, 'w') as f:
            json.dump(all_results, f)
        try:
            scores = print_coco_metrics(output_file)
            for k, v in scores.items():
                self.trainer.monitors.put_scalar(k, v)
        except Exception:
            logger.exception("Exception in COCO evaluation.")
Example #2
    def _eval(self):
        logdir = self._output_dir
        if cfg.TRAINER == 'replicated':
            all_results = multithread_predict_dataflow(self.dataflows,
                                                       self.predictors)
        else:
            filenames = [
                os.path.join(
                    logdir,
                    'outputs{}-part{}.json'.format(self.global_step, rank))
                for rank in range(hvd.local_size())
            ]

            if self._horovod_run_eval:
                local_results = predict_dataflow(self.dataflow, self.predictor)
                fname = filenames[hvd.local_rank()]
                with open(fname, 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for fname in filenames:
                with open(fname, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
                os.unlink(fname)

        scores = DatasetRegistry.get(
            self._eval_dataset).eval_inference_results(all_results)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(self._eval_dataset + '-' + k, v)
Example #3
 def __init__(self, config):
   """
   :param Config config:
   """
   # noinspection PyUnresolvedReferences,PyPackageRequirements
   import horovod.tensorflow as hvd
   hvd.init()
   print(
     "Horovod initialized. Hostname %s, pid %i, rank %i / size %i, local rank %i / local size %i." % (
       socket.gethostname(), os.getpid(), hvd.rank(), hvd.size(), hvd.local_rank(), hvd.local_size()))
   self._config = config
   self._hvd_mod = hvd
   self._local_rank = hvd.local_rank()
   self._local_size = hvd.local_size()
   self._rank = hvd.rank()
   self._size = hvd.size()
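A minimal sketch of how these cached values are typically consumed, assuming a TF1-style session config (not part of this snippet; the same visible_device_list pattern appears in Example #19 below): each process is pinned to one GPU on its node via local_rank(), while local_size() tells you how many processes share that node.

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
config = tf.ConfigProto()
# One GPU per process: local_rank() selects this process's GPU on the node.
config.gpu_options.visible_device_list = str(hvd.local_rank())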
Example #4
    def _setup_graph(self):
        num_gpu = cfg.TRAIN.NUM_GPUS
        if cfg.TRAINER == 'replicated':
            # TF bug in version 1.11, 1.12: https://github.com/tensorflow/tensorflow/issues/22750
            buggy_tf = get_tf_version_tuple() in [(1, 11), (1, 12)]

            # Use two predictor threads per GPU to get better throughput
            self.num_predictor = num_gpu if buggy_tf else num_gpu * 2
            self.predictors = [
                self._build_predictor(k % num_gpu)
                for k in range(self.num_predictor)
            ]
            self.dataflows = [
                get_eval_dataflow(self._eval_dataset,
                                  shard=k,
                                  num_shards=self.num_predictor)
                for k in range(self.num_predictor)
            ]
        else:
            # Only eval on the first machine,
            # because evaluation assumes that all horovod workers share the filesystem.
            # Alternatively, we could eval on all ranks and use allgather, but allgather sometimes hangs.
            self._horovod_run_eval = hvd.rank() == hvd.local_rank()
            if self._horovod_run_eval:
                self.predictor = self._build_predictor(0)
                self.dataflow = get_eval_dataflow(self._eval_dataset,
                                                  shard=hvd.local_rank(),
                                                  num_shards=hvd.local_size())

            self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
Example #5
    def set_up_graph(self, trainer: tp.Trainer) -> None:
        self.trainer = trainer
        if self.trainer_type == "replicated":
            # Use multiple predictor threads per GPU to get better throughput.
            self.num_predictor = self.num_gpus * 2
            self.predictors = [
                self._build_predictor(k % self.num_gpus) for k in range(self.num_predictor)
            ]
            self.dataflows = [
                get_eval_dataflow(  # type: ignore
                    self._eval_dataset,
                    self.is_aws,
                    self.is_gcs,
                    shard=k,
                    num_shards=self.num_predictor,
                )
                for k in range(self.num_predictor)
            ]
        else:
            if self.machine_rank == 0:
                # Run validation on one machine.
                self.predictor = self._build_predictor(0)
                self.dataflow = get_eval_dataflow(
                    self._eval_dataset,
                    self.is_aws,
                    self.is_gcs,
                    shard=hvd.local_rank(),
                    num_shards=hvd.local_size(),
                )

            # All workers must take part in this barrier, even if they
            # are not performing validation.
            self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
Example #6
    def local_size(cls, *args):
        """Get the number of workers at the current node."""

        try:
            return mgw.local_size(*args)
        except NameError:
            raise NameError('module <mgw> not imported')
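If raising is undesirable, the wrapper could instead fall back to a single local worker, mirroring the defensive try/except around hvd.local_size() in Example #18. A hedged sketch (the name local_size_or_default is hypothetical; mgw is assumed to be the optionally imported Horovod-like module):

def local_size_or_default(*args):
    """Number of workers on the current node, or 1 if <mgw> was never imported."""
    try:
        return mgw.local_size(*args)
    except NameError:
        # <mgw> is optional; behave as a single-process run
        return 1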
Example #7
def print_header():
    import horovod.tensorflow as hvd
    if hvd.rank() == 0:
        text = """

               _           _   _                     _
              (_)         | | | |                   | |
      ___  ___ _ _ __ ___ | | | |__   ___ _ __   ___| |__
     / __|/ __| | '_ ` _ \| | | '_ \ / _ \ '_ \ / __| '_ \\
     \__ \ (__| | | | | | | | | |_) |  __/ | | | (__| | | |
     |___/\___|_|_| |_| |_|_| |_.__/ \___|_| |_|\___|_| |_|



        """
        sys.stdout.write(text)
        sys.stdout.write("\n\n")

    LOGGER.info('Version: %s', sciml_bench.__version__)

    from mpi4py import MPI
    data = (MPI.Get_processor_name(), hvd.local_size())
    _comm = MPI.COMM_WORLD
    data = _comm.bcast(data, root=0)

    data = [data] if not isinstance(data, list) else data

    plurality = 'es' if len(data) > 1 else ''
    for node_name, local_size in data:
        LOGGER.info('%s has %s process%s', node_name, local_size, plurality)
Example #8
    def _setup_graph(self):
        num_gpu = cfg.TRAIN.NUM_GPUS
        if cfg.TRAINER == 'replicated':
            # Use two predictor threads per GPU to get better throughput
            self.num_predictor = num_gpu * 2
            self.predictors = [
                self._build_coco_predictor(k % num_gpu)
                for k in range(self.num_predictor)
            ]
            self.dataflows = [
                get_eval_dataflow(shard=k, num_shards=self.num_predictor)
                for k in range(self.num_predictor)
            ]
        else:
            if hvd.size() > hvd.local_size():
                logger.warn(
                    "Distributed evaluation with horovod is unstable. Sometimes MPI hangs for unknown reasons."
                )
            self.predictor = self._build_coco_predictor(0)
            self.dataflow = get_eval_dataflow(shard=hvd.rank(),
                                              num_shards=hvd.size())

            # use uint8 to aggregate strings
            self.local_result_tensor = tf.placeholder(
                tf.uint8, shape=[None], name='local_result_string')
            self.concat_results = hvd.allgather(self.local_result_tensor,
                                                name='concat_results')
            local_size = tf.expand_dims(tf.size(self.local_result_tensor), 0)
            self.string_lens = hvd.allgather(local_size, name='concat_sizes')
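Example #8 only sets up the graph ops; how the gathered uint8 buffer is turned back into Python objects on rank 0 is left implicit. A hedged sketch of that decoding step (not taken from the original repository; it assumes each rank fed json.dumps(local_results).encode('utf-8') viewed as uint8, and that the evaluated string_lens holds the per-rank byte counts in rank order):

import json

def decode_gathered_results(concat_bytes, lengths):
    # concat_bytes: 1-D uint8 numpy array produced by hvd.allgather
    # lengths: per-rank byte counts, in rank order
    all_results, offset = [], 0
    for n in lengths:
        chunk = concat_bytes[offset:offset + n]
        all_results.extend(json.loads(chunk.tobytes().decode('utf-8')))
        offset += n
    return all_results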
Example #9
 def dataset_options(self):
     options = tf.data.Options()
     options.experimental_deterministic = not self._is_training
     options.experimental_optimization.map_parallelization = self._enable_map_parallelization
     options.experimental_optimization.parallel_batch = True
     options.threading.private_threadpool_size = max(
         2, (multiprocessing.cpu_count() // hvd.local_size()) - 2)
     return options
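As a concrete, hypothetical illustration of the sizing rule above: on a node with 64 CPU cores and 8 Horovod workers (hvd.local_size() == 8), each worker gets max(2, 64 // 8 - 2) = 6 private threads, so together the workers stay below the node's core count rather than each claiming every core.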
Example #10
def read_tf_records(batch_size, tf_records, num_repeats=1,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None, interleave=True,
                    filter_amount=1.0):
    """
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: 1)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        interleave: whether to interleave examples from multiple tf_records
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    """
    if shuffle_examples and not shuffle_buffer_size:
        raise ValueError("Must set shuffle buffer size if shuffling examples")

    #tf_records = list(tf_records)
    # disable shuffle for sharding
    #if shuffle_records:
    #    random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    logging.info('hvd rank {}, local rank {}, size {}, shard_size {}'.format(hvd.rank(), hvd.local_rank(), hvd.size(), hvd.local_size()))
    record_list = record_list.shard(hvd.local_size(), hvd.local_rank())

    # compression_type here must agree with write_tf_examples
    map_func = functools.partial(
        tf.data.TFRecordDataset,
        buffer_size=8 * 1024 * 1024,
        compression_type='ZLIB')

    if interleave:
        # cycle_length = how many tfrecord files are read in parallel
        # The idea is to shuffle both the order of the files being read,
        # and the examples being read from the files.
        dataset = record_list.apply(tf.data.experimental.parallel_interleave(
            map_func, cycle_length=64, sloppy=True))
    else:
        dataset = record_list.flat_map(map_func)

    if filter_amount < 1.0:
        dataset = dataset.filter(
            lambda _: tf.random_uniform([]) < filter_amount)

    dataset = dataset.repeat(num_repeats)
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    dataset = dataset.batch(batch_size)
    return dataset
Example #11
    def _eval(self):
        logdir = args.logdir
        if cfg.TRAINER == 'replicated':
            # with ThreadPoolExecutor(max_workers=self.num_predictor, thread_name_prefix='EvalWorker') as executor, \
            with ThreadPoolExecutor(max_workers=self.num_predictor) as executor, \
                    tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
                futures = []
                for dataflow, pred in zip(self.dataflows, self.predictors):
                    futures.append(
                        executor.submit(eval_coco, dataflow, pred, pbar))
                all_results = list(
                    itertools.chain(*[fut.result() for fut in futures[:1]]))
                all_results_test = list(
                    itertools.chain(*[fut.result() for fut in futures[1:2]]))
        else:
            if self._horovod_run_eval:
                local_results = eval_coco(self.dataflow, self.predictor)
                output_partial = os.path.join(
                    logdir,
                    'outputs{}-part{}.json'.format(self.global_step,
                                                   hvd.local_rank()))
                with open(output_partial, 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for k in range(hvd.local_size()):
                output_partial = os.path.join(
                    logdir,
                    'outputs{}-part{}.json'.format(self.global_step, k))
                with open(output_partial, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
                os.unlink(output_partial)

        output_file = os.path.join(logdir,
                                   'outputs{}.json'.format(self.global_step))
        with open(output_file, 'w') as f:
            json.dump(all_results, f)
        try:
            scores = print_evaluation_scores(output_file)
            for k, v in scores.items():
                self.trainer.monitors.put_scalar(k, v)
        except Exception:
            logger.exception("Exception in COCO evaluation.")

        # all_results_test is only collected in the replicated branch above,
        # so guard the dump to avoid a NameError on the horovod path.
        if cfg.TRAINER == 'replicated':
            output_file_test = os.path.join(
                logdir, 'outputs-test{}.json'.format(self.global_step))
            with open(output_file_test, 'w') as f:
                json.dump(all_results_test, f)
Example #12
def init_workers(distributed=False):
    if distributed and not no_horovod:
        hvd.init()
        assert hvd.mpi_threads_supported()
        from mpi4py import MPI
        assert hvd.size() == MPI.COMM_WORLD.Get_size()
        comm = MPI.COMM_WORLD
        print("Rank: {}, Size: {}".format(hvd.rank(), hvd.size()))
        return SimpleNamespace(rank=hvd.rank(), size=hvd.size(),
                                local_rank=hvd.local_rank(),
                                local_size=hvd.local_size(), comm=comm)
    else:
        print("not doing distributed")
        return SimpleNamespace(rank=0, size=1, local_rank=0, local_size=1, comm=None)
Example #13
    def test_horovod_adasum_multiple_allreduce_gpu_nccl(self):
        """Test on GPU using NCCL that the Adasum correctly computes 2D tensors."""
        hvd.init()
        # TODO support non-MPI Adasum operation
        if not hvd.mpi_enabled() or not hvd.gpu_available(
                'tensorflow') or not hvd.nccl_built():
            self.skipTest("MPI, GPU or NCCL not available")

        rank = hvd.rank()
        rank_tensors = []
        size = hvd.size()
        # TODO support testing with non-power 2 ranks
        if not is_power2(size):
            self.skipTest("MPI rank is not power of 2")

        local_size = hvd.local_size()

        # Only run on homogeneous cluster
        if not hvd.is_homogeneous():
            self.skipTest("Horovod cluster is not homogeneous")

        num_nodes = int(size / local_size)
        for _ in range(size):
            rank_tensors.append([
                np.random.random_sample((2, 2)),
                np.random.random_sample((2, 2))
            ])
        sum_local_ranks_tensor = []
        for i in range(num_nodes):
            sum_local_ranks_tensor.append([np.zeros((2, 2)), np.zeros((2, 2))])
            for j in range(local_size):
                sum_local_ranks_tensor[i] = np.add(sum_local_ranks_tensor[i],
                                                   rank_tensors[j])

        answer = reference_tree_reduction(sum_local_ranks_tensor, num_nodes)
        answer = np.true_divide(answer, local_size)
        for dtype in [tf.float16, tf.float32, tf.float64]:
            with tf.device("/gpu:{}".format(hvd.local_rank())):
                tensors = map(tf.constant, rank_tensors[rank])
                # cast to the corresponding dtype
                tensors = map(lambda tensor: tf.cast(tensor, dtype), tensors)
                # and away we go: do reduction
                reduced_tensors = [
                    self.evaluate(hvd.allreduce(tensor, op=hvd.Adasum))
                    for tensor in tensors
                ]
                # cast expected result to the type of the tensorflow values
                np_type = dtype.as_numpy_dtype
                tmp = [t.astype(np_type) for t in answer]
                self.assertAllCloseAccordingToType(tmp, reduced_tensors)
Example #14
    def _setup_graph(self):
        num_gpu = cfg.TRAIN.NUM_GPUS
        if cfg.TRAINER == 'replicated':
            # Use two predictor threads per GPU to get better throughput
            self.num_predictor = num_gpu * 2
            self.predictors = [self._build_coco_predictor(k % num_gpu) for k in range(self.num_predictor)]
            self.dataflows = [get_eval_dataflow(shard=k, num_shards=self.num_predictor)
                              for k in range(self.num_predictor)]
        else:
            # Only eval on the first machine.
            # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs
            self._horovod_run_eval = hvd.rank() == hvd.local_rank()
            if self._horovod_run_eval:
                self.predictor = self._build_coco_predictor(0)
                self.dataflow = get_eval_dataflow(shard=hvd.local_rank(), num_shards=hvd.local_size())

            self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
Example #15
    def _setup_graph(self):
        num_gpu = cfg.TRAIN.NUM_GPUS
        if cfg.TRAINER == 'replicated':
            # Use two predictor threads per GPU to get better throughput
            self.num_predictor = num_gpu * 2
            self.predictors = [self._build_coco_predictor(k % num_gpu) for k in range(self.num_predictor)]
            self.dataflows = [get_eval_dataflow(shard=k, num_shards=self.num_predictor)
                              for k in range(self.num_predictor)]
        else:
            # Only eval on the first machine.
            # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs
            self._horovod_run_eval = hvd.rank() == hvd.local_rank()
            if self._horovod_run_eval:
                self.predictor = self._build_coco_predictor(0)
                self.dataflow = get_eval_dataflow(shard=hvd.local_rank(), num_shards=hvd.local_size())

            self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
Example #16
    def compute_validation_metrics(self) -> Any:
        if self.trainer_type == "replicated":
            all_results = multithread_predict_dataflow(
                self.dataflows, self.predictors
            )  # type: ignore
        else:
            filenames = [
                os.path.join(
                    self._output_dir, "outputs{}-part{}.json".format(self.trainer.global_step, rank)
                )
                for rank in range(hvd.local_size())
            ]

            if self.machine_rank == 0:
                local_results = predict_dataflow(self.dataflow, self.predictor)
                fname = filenames[hvd.local_rank()]
                with open(fname, "w") as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for fname in filenames:
                with open(fname, "r") as f:
                    obj = json.load(f)
                all_results.extend(obj)

        output_file = os.path.join(
            self._output_dir,
            "{}-outputs{}-{}.json".format(
                self._eval_dataset, self.trainer.global_step, time.time()
            ),
        )

        metrics = DatasetRegistry.get(self._eval_dataset).eval_inference_results(  # type: ignore
            all_results, output_file
        )

        # If there are no detections, the metrics result is totally empty, instead of containing
        # zeroes. Ensure that the main evaluation metric has some value.
        metrics.setdefault("mAP(bbox)/IoU=0.5:0.95", 0)

        return metrics
Example #17
    def _eval(self):
        logdir = args.logdir
        if cfg.TRAINER == 'replicated':
            with ThreadPoolExecutor(max_workers=self.num_predictor, thread_name_prefix='EvalWorker') as executor, \
                    tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
                futures = []
                for dataflow, pred in zip(self.dataflows, self.predictors):
                    futures.append(executor.submit(eval_coco, dataflow, pred, pbar))
                all_results = list(itertools.chain(*[fut.result() for fut in futures]))
        else:
            if self._horovod_run_eval:
                local_results = eval_coco(self.dataflow, self.predictor)
                output_partial = os.path.join(
                    logdir, 'outputs{}-part{}.json'.format(self.global_step, hvd.local_rank()))
                with open(output_partial, 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for k in range(hvd.local_size()):
                output_partial = os.path.join(
                    logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
                with open(output_partial, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
                os.unlink(output_partial)

        output_file = os.path.join(
            logdir, 'outputs{}.json'.format(self.global_step))
        with open(output_file, 'w') as f:
            json.dump(all_results, f)
        try:
            scores = print_evaluation_scores(output_file)
            for k, v in scores.items():
                self.trainer.monitors.put_scalar(k, v)
        except Exception:
            logger.exception("Exception in COCO evaluation.")
Example #18
def main(input_path_train, input_path_validation, downsampling_fact,
         downsampling_mode, channels, data_format, label_id, blocks, weights,
         image_dir, checkpoint_dir, trn_sz, val_sz, loss_type, fs_type,
         optimizer, batch, batchnorm, num_epochs, dtype, chkpt, filter_sz,
         growth, disable_checkpoints, disable_imsave, tracing, trace_dir,
         output_sampling, scale_factor):

    #init horovod
    nvtx.RangePush("init horovod", 1)
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        #not all horovod versions have that implemented
        try:
            comm_local_size = hvd.local_size()
        except Exception:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".
                  format(comm_size, comm_rank))
    nvtx.RangePop()  # init horovod

    #downsampling? recompute image dims
    image_height = image_height_orig // downsampling_fact
    image_width = image_width_orig // downsampling_fact

    #parameters
    per_rank_output = False
    loss_print_interval = 10

    #session config
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=6,  #1
        intra_op_parallelism_threads=1,  #6
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    sess_config.gpu_options.force_gpu_compatible = True

    #get data
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    trn_data = load_data(input_path_train, True, trn_sz, horovod)
    val_data = load_data(input_path_validation, False, val_sz, horovod)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("done.")

    #print some stats
    if comm_rank == 0:
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Batch normalization: {}".format(batchnorm))
        print("Blocks: {}".format(blocks))
        print("Growth rate: {}".format(growth))
        print("Filter size: {}".format(filter_sz))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Loss scale factor: {}".format(scale_factor))
        print("Output sampling target: {}".format(output_sampling))
        #print optimizer parameters
        for k, v in optimizer.items():
            print("Solver Parameters: {k}: {v}".format(k=k, v=v))
        #print("Optimizer type: {}".format(optimizer['opt_type']))
        print("Num training samples: {}".format(trn_data.shape[0]))
        print("Num validation samples: {}".format(val_data.shape[0]))
        print("Disable checkpoints: {}".format(disable_checkpoints))
        print("Disable image save: {}".format(disable_imsave))
        print("Downsampling factor: {}".format(downsampling_fact))
        print("Downsampling mode: {}".format(downsampling_mode))

    #compute epochs and stuff:
    if fs_type == "local":
        num_samples = trn_data.shape[0] // comm_local_size
    else:
        num_samples = trn_data.shape[0] // comm_size
    num_steps_per_epoch = num_samples // batch
    num_steps = num_epochs * num_steps_per_epoch
    if per_rank_output:
        print("Rank {} does {} steps per epoch".format(comm_rank,
                                                       num_steps_per_epoch))

    with training_graph.as_default():
        nvtx.RangePush("TF Init", 3)
        #create readers
        trn_reader = h5_input_reader(input_path_train,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     data_format=data_format,
                                     label_id=label_id,
                                     sample_target=output_sampling)
        val_reader = h5_input_reader(input_path_validation,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     data_format=data_format,
                                     label_id=label_id)
        #create datasets
        if fs_type == "local":
            trn_dataset = create_dataset(trn_reader,
                                         trn_data,
                                         batch,
                                         num_epochs,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=True)
            val_dataset = create_dataset(val_reader,
                                         val_data,
                                         batch,
                                         1,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=False)
        else:
            trn_dataset = create_dataset(trn_reader,
                                         trn_data,
                                         batch,
                                         num_epochs,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=True)
            val_dataset = create_dataset(val_reader,
                                         val_data,
                                         batch,
                                         1,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=False)

        #create iterators
        handle = tf.placeholder(tf.string,
                                shape=[],
                                name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype, tf.string),
            ((batch, len(channels), image_height_orig,
              image_width_orig) if data_format == "channels_first" else
             (batch, image_height_orig, image_width_orig, len(channels)),
             (batch, image_height_orig, image_width_orig),
             (batch, image_height_orig, image_width_orig), (batch)))
        next_elem = iterator.get_next()

        #if downsampling, do some preprocessing
        if downsampling_fact != 1:
            if downsampling_mode == "scale":
                #do downsampling
                rand_select = tf.cast(tf.one_hot(tf.random_uniform(
                    (batch, image_height, image_width),
                    minval=0,
                    maxval=downsampling_fact * downsampling_fact,
                    dtype=tf.int32),
                                                 depth=downsampling_fact *
                                                 downsampling_fact,
                                                 axis=-1),
                                      dtype=tf.int32)
                next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \
                             tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1), \
                                                                                 [1, downsampling_fact, downsampling_fact, 1], \
                                                                                 [1, downsampling_fact, downsampling_fact, 1], \
                                                                                 [1,1,1,1], 'VALID'), rand_select), axis=-1), \
                             tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1), \
                             next_elem[3])
            elif downsampling_mode == "center-crop":
                #some parameters
                length = 1. / float(downsampling_fact)
                offset = length / 2.
                boxes = [[offset, offset, offset + length, offset + length]
                         ] * batch
                box_ind = list(range(0, batch))
                crop_size = [image_height, image_width]

                #be careful with data order
                if data_format == "channels_first":
                    next_elem = (tf.transpose(next_elem[0], perm=[0, 2, 3, 1]),
                                 next_elem[1], next_elem[2], next_elem[3])

                #crop
                next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), \
                             ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1],axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32), \
                             tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2],axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1), \
                             next_elem[3])

                #be careful with data order
                if data_format == "channels_first":
                    next_elem = (tf.transpose(next_elem[0], perm=[0, 3, 1, 2]),
                                 next_elem[1], next_elem[2], next_elem[3])

            elif downsampling_mode == "random-crop":
                #some parameters
                crop_size = [
                    batch, image_height, image_width,
                    len(channels) + 2
                ]

                #concatenate input, crop, split apart
                crop_input = tf.concat([next_elem[0] if data_format=="channels_last" else tf.transpose(next_elem[0], perm=[0,2,3,1]), \
                                        ensure_type(tf.expand_dims(next_elem[1], axis=-1), tf.float32), \
                                        tf.expand_dims(next_elem[2], axis=-1)], \
                                       axis = -1)
                crop_output = tf.image.random_crop(crop_input, crop_size)

                #restore iterator output
                crop_image = crop_output[:, :, :, :len(channels)]
                crop_label = ensure_type(crop_output[:, :, :,
                                                     len(channels)], tf.int32)
                crop_weight = crop_output[:, :, :, len(channels) + 1]
                next_elem = (crop_image if data_format=="channels_last" else tf.transpose(crop_image, perm=[0,3,1,2]), \
                             crop_label, crop_weight, next_elem[3])

            else:
                raise ValueError(
                    "Error, downsampling mode {} not supported. Supported are [center-crop, random-crop, scale]"
                    .format(downsampling_mode))

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #compute the input filter number based on number of channels used
        num_channels = len(channels)
        nb_filter = 64

        #set up model
        logit, prediction = create_tiramisu(3,
                                            next_elem[0],
                                            image_height,
                                            image_width,
                                            num_channels,
                                            loss_weights=weights,
                                            nb_layers_per_block=blocks,
                                            p=0.2,
                                            wd=1e-4,
                                            dtype=dtype,
                                            batchnorm=batchnorm,
                                            growth_rate=growth,
                                            nb_filter=nb_filter,
                                            filter_sz=filter_sz,
                                            median_filter=False,
                                            data_format=data_format)
        #prediction_argmax = median_pool(prediction_argmax, 3, strides=[1,1,1,1])

        #set up loss
        loss = None
        if loss_type == "weighted":
            #cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1],
                logits=logit,
                weights=w_cast,
                reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
            if scale_factor != 1.0:
                loss *= scale_factor
        elif loss_type == "focal":
            labels_one_hot = tf.contrib.layers.one_hot_encoding(
                next_elem[1], 3)
            labels_one_hot = ensure_type(labels_one_hot, dtype)
            loss = focal_loss(onehot_labels=labels_one_hot,
                              logits=logit,
                              alpha=1.,
                              gamma=2.)
        else:
            raise ValueError("Error, loss type {} not supported.",
                             format(loss_type))

        #determine flops
        flops = graph_flops.graph_flops(
            format="NHWC" if data_format == "channels_last" else "NCHW",
            batch=batch,
            sess_config=sess_config)
        flops *= comm_size
        if comm_rank == 0:
            print('training flops: {:.3f} TF/step'.format(flops * 1e-12))

        #number of trainable parameters
        if comm_rank == 0:
            num_params = get_number_of_trainable_parameters()
            print('number of trainable parameters: {} ({} MB)'.format(
                num_params,
                num_params * (4 if dtype == tf.float32 else 2) * (2**-20)))

        if horovod:
            loss_avg = hvd.allreduce(ensure_type(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)

        #set up global step - keep on CPU
        with tf.device('/device:CPU:0'):
            global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        if optimizer['opt_type'].startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op, lr = get_larc_optimizer(optimizer, loss, global_step,
                                              num_steps_per_epoch, horovod)
        else:
            train_op, lr = get_optimizer(optimizer, loss, global_step,
                                         num_steps_per_epoch, horovod)

        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(labels=next_elem[1],
                                                    predictions=tf.argmax(
                                                        prediction, axis=3),
                                                    num_classes=3,
                                                    weights=None,
                                                    metrics_collections=None,
                                                    updates_collections=None,
                                                    name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables() if i.name.startswith('iou_score/')
        ])

        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        #hooks
        #these hooks are essential. regularize the step hook by adding one additional step at the end
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        #bcast init for bcasting the model after start
        if horovod:
            init_bcast = hvd.broadcast_global_variables(0)

        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = 5 * num_steps_per_epoch
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            if (not disable_checkpoints):
                hooks.append(
                    tf.train.CheckpointSaverHook(
                        checkpoint_dir=checkpoint_dir,
                        save_steps=checkpoint_save_freq,
                        saver=checkpoint_saver))
            #create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        if tracing is not None:
            import tracehook
            tracing_hook = tracehook.TraceHook(steps_to_trace=tracing,
                                               cache_traces=True,
                                               trace_dir=trace_dir)
            hooks.append(tracing_hook)

        # instead of averaging losses over an entire epoch, use a moving
        #  window average
        recent_losses = []
        loss_window_size = 10

        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])
            #restore from checkpoint:
            if comm_rank == 0 and not disable_checkpoints:
                load_model(sess, checkpoint_saver, checkpoint_dir)
            #broadcast loaded model variables
            if horovod:
                sess.run(init_bcast)
            #create iterator handles
            trn_handle, val_handle = sess.run(
                [trn_handle_string, val_handle_string])
            #init iterators
            sess.run(trn_init_op, feed_dict={handle: trn_handle})
            sess.run(val_init_op, feed_dict={handle: val_handle})

            nvtx.RangePop()  # TF Init

            # figure out what step we're on (it won't be 0 if we are
            #  restoring from a checkpoint) so we can count from there
            train_steps = sess.run([global_step])[0]

            #do the training
            epoch = 1
            step = 1

            t_sustained_start = time.time()

            nvtx.RangePush("Training Loop", 4)
            nvtx.RangePush("Epoch", epoch)
            start_time = time.time()
            while not sess.should_stop():

                #training loop
                try:
                    nvtx.RangePush("Step", step)
                    #construct feed dict
                    t_inst_start = time.time()
                    _, tmp_loss = sess.run(
                        [train_op, (loss if per_rank_output else loss_avg)],
                        feed_dict={handle: trn_handle})
                    t_inst_end = time.time()
                    train_steps += 1
                    train_steps_in_epoch = train_steps % num_steps_per_epoch
                    recent_losses = [tmp_loss
                                     ] + recent_losses[0:loss_window_size - 1]
                    train_loss = sum(recent_losses) / len(recent_losses)
                    nvtx.RangePop()  # Step
                    step += 1

                    #print step report
                    eff_steps = train_steps_in_epoch if (
                        train_steps_in_epoch > 0) else num_steps_per_epoch
                    if (train_steps % loss_print_interval) == 0:
                        if per_rank_output:
                            print(
                                "REPORT: rank {}, training loss for step {} (of {}) is {}, time {:.3f}"
                                .format(comm_rank, train_steps, num_steps,
                                        train_loss,
                                        time.time() - start_time))
                        else:
                            if comm_rank == 0:
                                print(
                                    "REPORT: training loss for step {} (of {}) is {}, time {:.3f}, r_inst {:.3f}"
                                    .format(
                                        train_steps, num_steps, train_loss,
                                        time.time() - start_time, 1e-12 *
                                        flops / (t_inst_end - t_inst_start)))

                    #do the validation phase
                    if train_steps_in_epoch == 0:
                        end_time = time.time()
                        #print epoch report
                        if per_rank_output:
                            print(
                                "COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}"
                                .format(
                                    comm_rank, epoch, num_epochs, train_loss,
                                    time.time() - start_time,
                                    1e-12 * flops * num_steps_per_epoch /
                                    (end_time - t_sustained_start)))
                        else:
                            if comm_rank == 0:
                                print(
                                    "COMPLETED: training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}"
                                    .format(
                                        epoch, num_epochs, train_loss,
                                        time.time() - start_time,
                                        1e-12 * flops * num_steps_per_epoch /
                                        (end_time - t_sustained_start)))

                        #evaluation loop
                        eval_loss = 0.
                        eval_steps = 0
                        nvtx.RangePush("Eval Loop", 7)
                        while True:
                            try:
                                #construct feed dict
                                _, tmp_loss, val_model_predictions, val_model_labels, val_model_filenames = sess.run(
                                    [
                                        iou_update_op,
                                        (loss
                                         if per_rank_output else loss_avg),
                                        prediction, next_elem[1], next_elem[3]
                                    ],
                                    feed_dict={handle: val_handle})

                                #print some images
                                if comm_rank == 0 and not disable_imsave:
                                    if have_imsave:
                                        imsave(
                                            image_dir + '/test_pred_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.png',
                                            np.argmax(
                                                val_model_predictions[0, ...],
                                                axis=2) * 100)
                                        imsave(
                                            image_dir + '/test_label_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.png',
                                            val_model_labels[0, ...] * 100)
                                        imsave(
                                            image_dir +
                                            '/test_combined_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.png', colormap[
                                                val_model_labels[0, ...],
                                                np.argmax(
                                                    val_model_predictions[0,
                                                                          ...],
                                                    axis=2)])
                                    else:
                                        np.savez(
                                            image_dir + '/test_epoch' +
                                            str(epoch) + '_estep' +
                                            str(eval_steps) + '_rank' +
                                            str(comm_rank) + '.npz',
                                            prediction=np.argmax(
                                                val_model_predictions[0, ...],
                                                axis=2) * 100,
                                            label=val_model_labels[0, ...] *
                                            100,
                                            filename=val_model_filenames[0])

                                eval_loss += tmp_loss
                                eval_steps += 1
                            except tf.errors.OutOfRangeError:
                                eval_steps = np.max([eval_steps, 1])
                                eval_loss /= eval_steps
                                if per_rank_output:
                                    print(
                                        "COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}"
                                        .format(comm_rank, epoch, num_epochs,
                                                eval_loss))
                                else:
                                    if comm_rank == 0:
                                        print(
                                            "COMPLETED: evaluation loss for epoch {} (of {}) is {}"
                                            .format(epoch, num_epochs,
                                                    eval_loss))
                                if per_rank_output:
                                    iou_score = sess.run(iou_op)
                                    print(
                                        "COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}"
                                        .format(comm_rank, epoch, num_epochs,
                                                iou_score))
                                else:
                                    iou_score = sess.run(iou_avg)
                                    if comm_rank == 0:
                                        print(
                                            "COMPLETED: evaluation IoU for epoch {} (of {}) is {}"
                                            .format(epoch, num_epochs,
                                                    iou_score))
                                sess.run(iou_reset_op)
                                sess.run(val_init_op,
                                         feed_dict={handle: val_handle})
                                break
                        nvtx.RangePop()  # Eval Loop

                        #reset counters
                        epoch += 1
                        step = 0
                        t_sustained_start = time.time()

                        nvtx.RangePop()  # Epoch
                        nvtx.RangePush("Epoch", epoch)

                except tf.errors.OutOfRangeError:
                    break

            nvtx.RangePop()  # Epoch
            nvtx.RangePop()  # Training Loop

        # write any cached traces to disk
        if tracing is not None:
            tracing_hook.write_traces()
Example #19
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.horovod:
        hvd.init()
    if FLAGS.use_fp16:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    validate_flags_or_throw(bert_config)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
    hvd_rank = 0
    hvd_local_rank = 0

    config = tf.ConfigProto()
    learning_rate = FLAGS.learning_rate
    if FLAGS.horovod:

        tf.logging.info("Multi-GPU training with TF Horovod")
        tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(),
                        hvd.rank())
        global_batch_size = FLAGS.train_batch_size * hvd.size(
        ) * FLAGS.num_accumulation_steps
        learning_rate = learning_rate * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        hvd_local_rank = hvd.local_rank()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.logging.info("***** Configuaration *****")
        for key in FLAGS.__flags.keys():
            tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
        tf.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    training_hooks.append(
        LogTrainRunHook(global_batch_size, hvd_rank,
                        FLAGS.save_checkpoints_steps))

    # Prepare Training Data
    if FLAGS.do_train:
        train_examples = read_squad_examples(
            input_file=FLAGS.train_file,
            is_training=True,
            version_2_with_negative=FLAGS.version_2_with_negative)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in the `input_fn`.
        rng = random.Random(12345)
        rng.shuffle(train_examples)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.local_size())
            ]
            num_examples_per_local_rank = len(
                train_examples) // hvd.local_size()
            remainder = len(train_examples) % hvd.local_size()
            if hvd.local_rank() < remainder:
                start_index = hvd.local_rank() * (num_examples_per_local_rank +
                                                  1)
                end_index = start_index + num_examples_per_local_rank + 1
            else:
                start_index = hvd.local_rank(
                ) * num_examples_per_local_rank + remainder
                end_index = start_index + (num_examples_per_local_rank)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                hvd=None if not FLAGS.horovod else hvd,
                                use_fp16=FLAGS.use_fp16)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:

        # We write to a temporary file to avoid storing very large constant tensors
        # in memory.
        train_writer = FeatureWriter(filename=tmp_filenames[hvd_local_rank],
                                     is_training=True)
        convert_examples_to_features(
            examples=train_examples[start_index:end_index],
            tokenizer=tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=True,
            output_fn=train_writer.process_feature,
            verbose_logging=FLAGS.verbose_logging)
        train_writer.close()

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num orig examples = %d", end_index - start_index)
        tf.logging.info("  Num split examples = %d", train_writer.num_features)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        tf.logging.info("  LR = %f", learning_rate)
        del train_examples
        if FLAGS.horovod:
            barrier = hvd.allreduce(tf.constant(0))
            with tf.Session(config=config) as sess:
                sess.run(barrier)

        train_input_fn = input_fn_builder(
            input_file=tmp_filenames,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn,
                        hooks=training_hooks,
                        max_steps=num_train_steps)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (
            num_train_steps - training_hooks[-1].skipped
        ) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.logging.info("-----------------------------")
            tf.logging.info("Total Training Time = %0.2f for Sentences = %d",
                            train_time_elapsed,
                            num_train_steps * global_batch_size)
            tf.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (num_train_steps - training_hooks[-1].skipped) *
                global_batch_size)
            tf.logging.info(
                "Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.logging.info("Throughput Average (sentences/sec) = %0.2f",
                            ss_sentences_per_second)
            tf.logging.info("-----------------------------")

    if FLAGS.export_trtis and master_process:
        export_model(estimator, FLAGS.output_dir, FLAGS.init_checkpoint)

    if FLAGS.do_predict and master_process:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative)

        # Perform evaluation on subset, useful for profiling
        if FLAGS.num_eval_iterations is not None:
            eval_examples = eval_examples[:FLAGS.num_eval_iterations *
                                          FLAGS.predict_batch_size]

        eval_writer = FeatureWriter(filename=os.path.join(
            FLAGS.output_dir, "eval.tf_record"),
                                    is_training=False)
        eval_features = []

        def append_feature(feature):
            eval_features.append(feature)
            eval_writer.process_feature(feature)

        convert_examples_to_features(examples=eval_examples,
                                     tokenizer=tokenizer,
                                     max_seq_length=FLAGS.max_seq_length,
                                     doc_stride=FLAGS.doc_stride,
                                     max_query_length=FLAGS.max_query_length,
                                     is_training=False,
                                     output_fn=append_feature,
                                     verbose_logging=FLAGS.verbose_logging)
        eval_writer.close()

        tf.logging.info("***** Running predictions *****")
        tf.logging.info("  Num orig examples = %d", len(eval_examples))
        tf.logging.info("  Num split examples = %d", len(eval_features))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = input_fn_builder(
            input_file=eval_writer.filename,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        all_results = []
        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()
        for result in estimator.predict(predict_input_fn,
                                        yield_single_examples=True,
                                        hooks=eval_hooks):
            if len(all_results) % 1000 == 0:
                tf.logging.info("Processing example: %d" % (len(all_results)))
            unique_id = int(result["unique_ids"])
            start_logits = [float(x) for x in result["start_logits"].flat]
            end_logits = [float(x) for x in result["end_logits"].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        eval_time_elapsed = time.time() - eval_start_time
        eval_time_wo_overhead = eval_hooks[-1].total_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
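        # time_list is sorted ascending, so max(time_list[:int(n * p)]) below is an
        # approximate p-th percentile latency.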
        num_sentences = (eval_hooks[-1].count -
                         eval_hooks[-1].skipped) * FLAGS.predict_batch_size

        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.logging.info("-----------------------------")
        tf.logging.info("Total Inference Time = %0.2f for Sentences = %d",
                        eval_time_elapsed,
                        eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead,
            (eval_hooks[-1].count - eval_hooks[-1].skipped) *
            FLAGS.predict_batch_size)
        tf.logging.info("Summary Inference Statistics")
        tf.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
        tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                        cf_50 * 1000)
        tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                        cf_90 * 1000)
        tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                        cf_95 * 1000)
        tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                        cf_99 * 1000)
        tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                        cf_100 * 1000)
        tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.logging.info("Throughput Average (sentences/sec) = %0.2f",
                        ss_sentences_per_second)
        tf.logging.info("-----------------------------")

        output_prediction_file = os.path.join(FLAGS.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                                 "null_odds.json")

        write_predictions(eval_examples, eval_features, all_results,
                          FLAGS.n_best_size, FLAGS.max_answer_length,
                          FLAGS.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file)
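The per-local-rank split in the training branch above (start_index / end_index, with the remainder spread over the first local ranks) can be checked in isolation. A minimal sketch, standard library only; shard_bounds is a hypothetical helper, not part of the script:

def shard_bounds(num_examples, num_shards, shard_id):
    # Mirrors the start_index/end_index logic: the first `remainder` shards get one extra example.
    per_shard = num_examples // num_shards
    remainder = num_examples % num_shards
    if shard_id < remainder:
        start = shard_id * (per_shard + 1)
        end = start + per_shard + 1
    else:
        start = shard_id * per_shard + remainder
        end = start + per_shard
    return start, end

# 10 examples over 4 local ranks -> shard sizes 3, 3, 2, 2 with full, non-overlapping coverage.
assert [shard_bounds(10, 4, r) for r in range(4)] == [(0, 3), (3, 6), (6, 8), (8, 10)]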
Ejemplo n.º 20
0
        print('Unable to call \n'
              '`tf.compat.v1.enable_resource_variables()`. Continuing...')

try:
    import horovod
    import horovod.tensorflow as hvd
    try:
        RANK = hvd.rank()
    except ValueError:
        hvd.init()

    RANK = hvd.rank()
    SIZE = hvd.size()
    HAS_HOROVOD = True
    IS_CHIEF = (RANK == 0)
    LOCAL_SIZE = hvd.local_size()
    LOCAL_RANK = hvd.local_rank()
    #  logging.info(f'using horovod from: {horovod.__file__}')
    #  logging.info(f'using horovod version: {horovod.__version__}')
    prefix = f'{RANK} / {SIZE} ::'
    if IS_CHIEF:
        print(80 * '=')
        print(f'{prefix} Using tensorflow version: {tf.__version__}')
        print(f'{prefix} Using tensorflow from: {tf.__file__}')
        print(f'{prefix} Using horovod version: {horovod.__version__}')
        print(f'{prefix} Using horovod from: {horovod.__file__}')
        print(80 * '=')
    else:
        print(f"Hello, I'm rank: {RANK} of {SIZE} total ranks")

    GPUS = tf.config.experimental.list_physical_devices('GPU')
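The snippet is cut off right after the physical GPUs are listed. What typically follows in Horovod setups (an assumption here, not shown in the original) is pinning one GPU per local rank:

    # Assumed continuation: give each local rank its own GPU and enable memory growth.
    if GPUS:
        gpu = GPUS[LOCAL_RANK % len(GPUS)]
        tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.set_visible_devices(gpu, 'GPU')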
Ejemplo n.º 21
0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    lr_scaler = hvd.size()
    # By default, Adasum doesn't need scaling when increasing batch size. If used with NCCL,
    # scale lr by local_size
    if args.use_adasum:
        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1

    # Horovod: adjust learning rate based on lr_scaler.
    opt = tf.train.AdamOptimizer(args.lr * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(
        opt, op=hvd.Adasum if args.use_adasum else hvd.Average)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=args.num_steps // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
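The train_input_generator helper used above is not shown in the snippet. A minimal sketch of what it is assumed to look like (an endless, reshuffling mini-batch generator over the NumPy arrays prepared earlier):

def train_input_generator(x, y, batch_size=100):
    # Yield shuffled (image, label) mini-batches forever, reshuffling after every pass.
    assert len(x) == len(y)
    while True:
        perm = np.random.permutation(len(x))
        x, y = x[perm], y[perm]
        for i in range(0, len(x) - batch_size + 1, batch_size):
            yield x[i:i + batch_size], y[i:i + batch_size]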
Ejemplo n.º 22
0
def main():
    ''' simple starter program for tensorflow models. '''
    logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:%(name)s:%(message)s'
    logging_datefmt = '%Y-%m-%d %H:%M:%S'
    logging_level = logging.INFO

    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        '-c',
        '--config',
        dest='config_filename',
        help='configuration filename in json format [default: %s]' %
        DEFAULT_CONFIG,
        default=DEFAULT_CONFIG)
    parser.add_argument(
        '--interop',
        type=int,
        help=
        'set Tensorflow "inter_op_parallelism_threads" session config variable [default: %s]'
        % DEFAULT_INTEROP,
        default=DEFAULT_INTEROP)
    parser.add_argument(
        '--intraop',
        type=int,
        help=
        'set Tensorflow "intra_op_parallelism_threads" session config variable [default: %s]'
        % DEFAULT_INTRAOP,
        default=DEFAULT_INTRAOP)
    parser.add_argument(
        '-l',
        '--logdir',
        default=DEFAULT_LOGDIR,
        help='define location to save log information [default: %s]' %
        DEFAULT_LOGDIR)

    parser.add_argument('--horovod',
                        dest='horovod',
                        default=False,
                        action='store_true',
                        help="Use horovod")

    parser.add_argument('--debug',
                        dest='debug',
                        default=False,
                        action='store_true',
                        help="Set Logger to DEBUG")
    parser.add_argument('--error',
                        dest='error',
                        default=False,
                        action='store_true',
                        help="Set Logger to ERROR")
    parser.add_argument('--warning',
                        dest='warning',
                        default=False,
                        action='store_true',
                        help="Set Logger to WARNING")
    parser.add_argument('--logfilename',
                        dest='logfilename',
                        default=None,
                        help='if set, logging information will go to file')
    args = parser.parse_args()

    hvd = None
    if args.horovod:
        import horovod
        import horovod.tensorflow as hvd
        hvd.init()
        logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:' + (
            '%05d' % hvd.rank()) + ':%(name)s:%(message)s'

        if hvd.rank() > 0:
            logging_level = logging.WARNING

    if args.debug and not args.error and not args.warning:
        logging_level = logging.DEBUG
        os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0'
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
    elif not args.debug and args.error and not args.warning:
        logging_level = logging.ERROR
    elif not args.debug and not args.error and args.warning:
        logging_level = logging.WARNING

    logging.basicConfig(level=logging_level,
                        format=logging_format,
                        datefmt=logging_datefmt,
                        filename=args.logfilename)

    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        logging.warning('CUDA_VISIBLE_DEVICES=%s %s',
                        os.environ['CUDA_VISIBLE_DEVICES'],
                        device_lib.list_local_devices())
    else:
        logging.info('CUDA_VISIBLE_DEVICES not defined in os.environ')
    logging.info('using tensorflow version:   %s', tf.__version__)
    logging.info('using tensorflow from:      %s', tf.__file__)
    if hvd:
        logging.warning(
            'rank: %5d   size: %5d  local rank: %5d  local size: %5d',
            hvd.rank(), hvd.size(), hvd.local_rank(), hvd.local_size())

        logging.info('using horovod version:      %s', horovod.__version__)
        logging.info('using horovod from:         %s', horovod.__file__)
    logging.info('logdir:                     %s', args.logdir)
    logging.info('interop:                    %s', args.interop)
    logging.info('intraop:                    %s', args.intraop)

    device_str = '/CPU:0'
    if tf.test.is_gpu_available():
        # device_str = '/device:GPU:' + str(hvd.local_rank())
        gpus = tf.config.experimental.list_logical_devices('GPU')
        logger.warning('gpus = %s', gpus)
        # assert hvd.local_rank() < len(gpus), f'localrank = {hvd.local_rank()} len(gpus) = {len(gpus)}'
        device_str = gpus[0].name
        # logger.info('device_str = %s',device_str)

    logger.warning('device:                     %s', device_str)

    config = json.load(open(args.config_filename))
    config['device'] = device_str

    logger.info('-=-=-=-=-=-=-=-=-  CONFIG FILE -=-=-=-=-=-=-=-=-')
    logger.info('%s = \n %s', args.config_filename,
                json.dumps(config, indent=4, sort_keys=True))
    logger.info('-=-=-=-=-=-=-=-=-  CONFIG FILE -=-=-=-=-=-=-=-=-')
    config['hvd'] = hvd

    with tf.Graph().as_default():
        logger.info('getting datasets')
        trainds, validds = data_handler.get_datasets(config)

        input_shape = (config['data']['batch_size'], ) + tuple(
            config['data']['image_shape'])
        target_shape = (config['data']['batch_size'],
                        config['data']['image_shape'][0])

        iterator = tf.compat.v1.data.Iterator.from_structure(
            (tf.float32, tf.int32), (input_shape, target_shape))
        input, target = iterator.get_next()
        training_init_op = iterator.make_initializer(trainds)
        valid_init_op = iterator.make_initializer(validds)

        with tf.device(device_str):

            is_training_pl = tf.compat.v1.placeholder(tf.bool, shape=())
            batch = tf.Variable(0)
            batch_size = tf.constant(config['data']['batch_size'])

            pred, endpoints = model.get_model(input, is_training_pl, config)
            logger.info('pred = %s  target = %s', pred.shape, target.shape)
            # pred = BxC, target = BxC
            loss = losses.get_loss(config)(labels=target, logits=pred)
            #tf.compat.v1.summary.scalar('loss/combined',loss)

            #learning_rate = pointnet_seg.get_learning_rate(batch,config) * hvd.size()
            learning_rate = lr_func.get_learning_rate(batch * batch_size,
                                                      config)
            tf.compat.v1.summary.scalar('learning_rate', learning_rate)
            if config['optimizer']['name'] == 'adam':
                optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)

            # adding Horovod distributed optimizer
            if hvd:
                optimizer = hvd.DistributedOptimizer(optimizer)

            # create the training operator
            train_op = optimizer.minimize(loss, global_step=batch)

            # Add ops to save and restore all the variables.
            saver = tf.compat.v1.train.Saver()

            merged = tf.compat.v1.summary.merge_all()

        logger.info('create session')

        config_proto = tf.compat.v1.ConfigProto()
        if 'gpu' in device_str:
            config_proto.gpu_options.allow_growth = True
            config_proto.gpu_options.visible_device_list = os.environ[
                'CUDA_VISIBLE_DEVICES']
        else:
            config_proto.allow_soft_placement = True
            config_proto.intra_op_parallelism_threads = args.intraop
            config_proto.inter_op_parallelism_threads = args.interop

        # Initialize an iterator over a dataset with 10 elements.
        sess = tf.compat.v1.Session(config=config_proto)

        # create tensorboard writers
        if hvd and hvd.rank() == 0:
            train_writer = tf.compat.v1.summary.FileWriter(
                os.path.join(args.logdir, 'train'), sess.graph)
            valid_writer = tf.compat.v1.summary.FileWriter(
                os.path.join(args.logdir, 'valid'), sess.graph)

        # initialize global vars and horovod broadcast initial model
        init = tf.compat.v1.global_variables_initializer()
        sess.run(init, {is_training_pl: True})
        if hvd:
            sess.run(hvd.broadcast_global_variables(0))

        logger.info('running over data')
        status_interval = config['training']['status']
        loss_sum = 0.
        for epoch in range(config['training']['epochs']):
            logger.info('epoch %s of %s', epoch + 1,
                        config['training']['epochs'])

            # initialize the data iterator for training loop
            sess.run(training_init_op)

            # training loop
            start = time.time()
            while True:
                try:
                    # set that we are training
                    feed_dict = {is_training_pl: True}
                    summary, step, _, loss_val = sess.run(
                        [merged, batch, train_op, loss], feed_dict=feed_dict)

                    # report status periodically
                    if step % status_interval == 0:
                        end = time.time()
                        duration = end - start
                        logger.info(
                            'step: %10d    imgs/sec: %10.6f', step,
                            float(status_interval) *
                            config['data']['batch_size'] / duration)
                        start = time.time()

                # exception thrown when data is done
                except tf.errors.OutOfRangeError:
                    logger.info(' end of epoch ')
                    saver.save(sess,
                               os.path.join(args.logdir, "model.ckpt"),
                               global_step=step)
                    break

            logger.info('running validation')
            # initialize the validation data iterator
            sess.run(valid_init_op)

            steps = 0.
Ejemplo n.º 23
0
        sys.path.insert(i, p)
        i += 1
print()

try:
    from mpi4py import MPI
    name = MPI.Get_processor_name()
    comm = MPI.COMM_WORLD
    print("mpi4py:", "name: %s," % name, "rank: %i," % comm.Get_rank(),
          "size: %i" % comm.Get_size())
    hosts = comm.allgather(
        (comm.Get_rank(), name))  # Get the names of all the other hosts
    print("  all hosts:", {key: item for (key, item) in hosts})
except ImportError:
    print("mpi4py not available")

print("Import TF now...")
import tensorflow as tf
print("TF version:", tf.__version__)

import horovod
print("Horovod version:", horovod.__version__)
import horovod.tensorflow as hvd

# Initialize Horovod
hvd.init()

print(
    "pid %i: hvd: rank: %i, size: %i, local_rank %i, local_size %i" %
    (os.getpid(), hvd.rank(), hvd.size(), hvd.local_rank(), hvd.local_size()))
def log_csv(model, batch_size, device, num_devices, num_devices_per_node,
            disable_ib, disable_nccl_p2p, img_sec_mean, img_sec_conf,
            total_img_sec_mean, total_img_sec_conf):
    if hvd.rank() != 0:
        return
    with open('/var/scratch/sdhar/logs/tensorflow_synthetic.csv',
              'a',
              newline='') as f:
        csvwriter = csv.writer(f, lineterminator="\n")
        csvwriter.writerow([
            model, batch_size, device, num_devices, num_devices_per_node,
            disable_ib, disable_nccl_p2p, img_sec_mean, img_sec_conf,
            total_img_sec_mean, total_img_sec_conf
        ])


log_csv(
    args.model,
    str(args.batch_size),
    device,
    str(hvd.size()),
    str(hvd.local_size()),
    #Disable infiniband
    str(args.disable_ib),
    #Disable NCCL P2P Communication
    str(args.disable_p2p),
    str(img_sec_mean),
    str(img_sec_conf),
    str(hvd.size() * img_sec_mean),
    str(hvd.size() * img_sec_conf))
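img_sec_mean and img_sec_conf are computed before this fragment starts; in synthetic benchmarks of this style they are typically the mean and a ~95% confidence band over per-window throughput samples. A hedged sketch of that computation (the helper itself is an assumption, only the output names match the call above):

import numpy as np

def summarize_throughput(img_secs):
    # img_secs: images/sec measured over individual timing windows on this rank.
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)  # ~95% band, assuming roughly normal noise
    return img_sec_mean, img_sec_conf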
Ejemplo n.º 25
0
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config.gpu_options.allow_growth = False
    config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
model = getattr(applications, args.model)(weights=None)

lr_scaler = hvd.size()
# By default, Adasum doesn't need scaling when increasing batch size. If used with NCCL,
# scale lr by local_size
if args.use_adasum:
    lr_scaler = hvd.local_size() if args.cuda and hvd.nccl_built() else 1

opt = tf.train.GradientDescentOptimizer(0.01 * lr_scaler)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
opt = hvd.DistributedOptimizer(
    opt,
    compression=compression,
    op=hvd.Adasum if args.use_adasum else hvd.Average)

init = tf.global_variables_initializer()
bcast_op = hvd.broadcast_global_variables(0)
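The fragment ends before init and bcast_op are actually run. In graph (non-eager) mode the usual next step, hedged here since the original loop is not shown, is to initialize variables and then broadcast rank 0's weights so every worker starts identically:

# Assumed continuation (graph mode): initialize, sync weights, then run the timed loop.
with tf.Session(config=config) as session:
    session.run(init)
    session.run(bcast_op)
    # ... timed benchmark iterations on `model` and `opt` would follow here ...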
Ejemplo n.º 26
0
def main():
   ''' simple starter program for tensorflow models. '''
   parser = argparse.ArgumentParser(description='')
   parser.add_argument('-c','--config',dest='config_filename',help='configuration filename in json format [default: %s]' % DEFAULT_CONFIG,default=DEFAULT_CONFIG)
   parser.add_argument('--interop',type=int,help='set Tensorflow "inter_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTEROP,default=DEFAULT_INTEROP)
   parser.add_argument('--intraop',type=int,help='set Tensorflow "intra_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTRAOP,default=DEFAULT_INTRAOP)
   parser.add_argument('-l','--logdir',default=DEFAULT_LOGDIR,help='define location to save log information [default: %s]' % DEFAULT_LOGDIR)

   parser.add_argument('--horovod', default=False, action='store_true', help="Use MPI with horovod")
   parser.add_argument('--profiler',default=False, action='store_true', help='Use TF profiler, needs CUPTI in LD_LIBRARY_PATH for Cuda')
   parser.add_argument('--profrank',default=0,type=int,help='set which rank to profile')

   parser.add_argument('--batch-term',dest='batch_term',type=int,help='if set, terminates training after the specified number of batches',default=0)

   parser.add_argument('--evaluate',help='evaluate a pre-trained model file on the test data set only.')
   parser.add_argument('--train-more',dest='train_more',help='load a pre-trained model file and continue training.')

   parser.add_argument('--debug', dest='debug', default=False, action='store_true', help="Set Logger to DEBUG")
   parser.add_argument('--error', dest='error', default=False, action='store_true', help="Set Logger to ERROR")
   parser.add_argument('--warning', dest='warning', default=False, action='store_true', help="Set Logger to WARNING")
   parser.add_argument('--logfilename',dest='logfilename',default=None,help='if set, logging information will go to file')
   args = parser.parse_args()
   
   hvd = None
   rank = 0
   nranks = 1
   logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:%(name)s:%(message)s'
   logging_datefmt = '%Y-%m-%d %H:%M:%S'
   logging_level = logging.INFO
   if args.horovod:
      print('importing horovod')
      sys.stdout.flush()
      sys.stderr.flush()

      import horovod
      import horovod.tensorflow as hvd
      hvd.init()
      logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:' + (
                 '%05d' % hvd.rank()) + ':%(name)s:%(message)s'
      rank = hvd.rank()
      nranks = hvd.size()
      if rank > 0:
         logging_level = logging.WARNING

   # Setup Logging
   if args.debug and not args.error and not args.warning:
      logging_level = logging.DEBUG
      os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0'
      os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
   elif not args.debug and args.error and not args.warning:
      logging_level = logging.ERROR
   elif not args.debug and not args.error and args.warning:
      logging_level = logging.WARNING

   logging.basicConfig(level=logging_level,
                       format=logging_format,
                       datefmt=logging_datefmt,
                       filename=args.logfilename)
   
   if hvd:
      logging.warning('host: %s rank: %5d   size: %5d  local rank: %5d  local size: %5d',
                      socket.gethostname(),hvd.rank(), hvd.size(),
                      hvd.local_rank(), hvd.local_size())
   
   tf.config.threading.set_inter_op_parallelism_threads(args.interop)
   tf.config.threading.set_intra_op_parallelism_threads(args.intraop)

   # Setup GPUs
   gpus = tf.config.list_physical_devices('GPU')
   logger.info(   'number of gpus:              %s',len(gpus))
   for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
   if hvd and len(gpus) > 0:
      tf.config.set_visible_devices(gpus[hvd.local_rank() % len(gpus)],'GPU')

   logging.info(   'using tensorflow version:   %s (%s)',tf.__version__,tf.__git_version__)
   logging.info(   'using tensorflow from:      %s',tf.__file__)
   if hvd:
      logging.info('using horovod version:      %s',horovod.__version__)
      logging.info('using horovod from:         %s',horovod.__file__)
   logging.info(   'logdir:                     %s',args.logdir)
   logging.info(   'interop:                    %s',args.interop)
   logging.info(   'intraop:                    %s',args.intraop)
   
   # this must be created after the config settings
   gtape = tf.GradientTape()
   if args.horovod:
      gtape = hvd.DistributedGradientTape(gtape)

   config = json.load(open(args.config_filename))
   # config['device'] = device_str
   
   config['profrank'] = args.profrank
   config['profiler'] = args.profiler
   config['logdir'] = args.logdir
   config['rank'] = rank
   config['nranks'] = nranks
   config['evaluate'] = False
   config['batch_term'] = args.batch_term
   if args.batch_term > 0:
      config['training']['epochs'] = 1
      config['training']['status'] = 1 if args.batch_term < config['training']['status'] else config['training']['status']

   if args.evaluate is not None:
      config['evaluate'] = True
      config['model_file'] = args.evaluate
      config['training']['epochs'] = 1
      logger.info('evaluating model file:      %s',args.evaluate)
   elif args.train_more is not None:
      config['train_more'] = True
      config['model_file'] = args.train_more
      logger.info('continuing model file:      %s',args.train_more)


   # using mixed precision?
   if isinstance(config['model']['mixed_precision'],str):
      logger.info('using mixed precision:      %s',config['model']['mixed_precision'])
      tf.keras.mixed_precision.set_global_policy(config['model']['mixed_precision'])

   logger.info('-=-=-=-=-=-=-=-=-  CONFIG FILE -=-=-=-=-=-=-=-=-')
   logger.info('%s = \n %s',args.config_filename,json.dumps(config,indent=4,sort_keys=True))
   logger.info('-=-=-=-=-=-=-=-=-  CONFIG FILE -=-=-=-=-=-=-=-=-')
   config['hvd'] = hvd

   sys.stdout.flush()
   sys.stderr.flush()

   trainds,testds = data_handler.get_datasets(config)
   
   logger.info('get model')
   net = model.get_model(config)
   loss_func = losses.get_loss(config)
   opt = get_optimizer(config)
   if isinstance(config['model']['mixed_precision'],str):
      opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)

   # initialize and create the model
   # input_shape = [config['data']['batch_size'],config['data']['num_points'],config['data']['num_features']]
   # output = net(tf.random.uniform(input_shape))

   # load previous model weights
   if args.evaluate:
      net.load_weights(args.evaluate)
   elif args.train_more:
      net.load_weights(args.train_more)

   # # synchronize models across ranks
   # if hvd:
   #    hvd.broadcast_variables(net.variables, root_rank=0)
   #    hvd.broadcast_variables(opt.variables(), root_rank=0)

   train_summary_writer = None
   test_summary_writer = None
   test_jet_writer = None
   test_ele_writer = None
   test_bkg_writer = None
   test_mean_writer = None
   if rank == 0:
      train_summary_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'train')
      test_summary_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'test')
      
      test_jet_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'jet_iou')
      test_ele_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'ele_iou')
      test_bkg_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'bkg_iou')
      test_mean_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'mean_iou')

      #tf.keras.utils.plot_model(net, "network_model.png", show_shapes=True)
      
      #with train_summary_writer.as_default():
        #tf.summary.graph(train_step.get_concrete_function().graph)

   batches_per_epoch = 0
   train_mIoU_sum = 0.
   test_mIoU_sum = 0.
   for epoch_num in range(config['training']['epochs']):
      
      logger.info('begin epoch %s',epoch_num)

      if not config['evaluate']:
         train_output = epoch_loop.one_train_epoch(config,trainds,net,
                                                   loss_func,opt,epoch_num,
                                                   train_summary_writer,
                                                   batches_per_epoch,
                                                   gtape)
         batches_per_epoch = train_output['batches_per_epoch']
         train_mIoU_sum += train_output['mIoU']
         logger.info('train mIoU sum: %10.4f',train_mIoU_sum / (epoch_num + 1))

      test_output = epoch_loop.one_eval_epoch(config,testds,net,
                                              loss_func,opt,epoch_num,
                                              test_summary_writer,
                                              batches_per_epoch,
                                              test_jet_writer,
                                              test_ele_writer,
                                              test_bkg_writer,
                                              test_mean_writer)
      test_mIoU_sum += test_output['mIoU']
      logger.info('test mIoU sum: %10.4f',test_mIoU_sum / (epoch_num + 1))

      if rank == 0:
         with test_summary_writer.as_default():
            step = (epoch_num + 1) * batches_per_epoch
            tf.summary.scalar('metrics/mIoU_AOC', test_mIoU_sum / (epoch_num + 1),step=step)
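The example above wraps a single tf.GradientTape in hvd.DistributedGradientTape once, up front, and hands it to the epoch loop. The more common Horovod TF2 pattern, shown here only as a hedged sketch (the loss call convention and epoch_loop internals are assumptions, not taken from this repo), builds the tape per step and broadcasts state after the first batch:

@tf.function
def train_step(net, opt, loss_func, inputs, targets, first_batch):
    with tf.GradientTape() as tape:
        loss = loss_func(targets, net(inputs, training=True))
    # Average gradients across ranks before applying them.
    tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss, net.trainable_variables)
    opt.apply_gradients(zip(grads, net.trainable_variables))
    if first_batch:
        # Start every rank from rank 0's weights and optimizer state.
        hvd.broadcast_variables(net.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    return loss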
def main(device, input_path_train, input_path_validation, dummy_data,
         downsampling_fact, downsampling_mode, channels, data_format, label_id,
         weights, image_dir, checkpoint_dir, trn_sz, val_sz, loss_type, model,
         decoder, fs_type, optimizer, batch, batchnorm, num_epochs, dtype,
         disable_checkpoints, disable_imsave, tracing, trace_dir,
         output_sampling, scale_factor, intra_threads, inter_threads):
    #init horovod
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        #not all horovod versions have that implemented
        try:
            comm_local_size = hvd.local_size()
        except:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".
                  format(comm_size))

    #downsampling? recompute image dims
    image_height = image_height_orig // downsampling_fact
    image_width = image_width_orig // downsampling_fact

    #parameters
    per_rank_output = False
    loss_print_interval = 1

    #session config
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=inter_threads,  #6
        intra_op_parallelism_threads=intra_threads,  #1
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    sess_config.gpu_options.force_gpu_compatible = True

    #get data
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    train_files = load_data(input_path_train, True, trn_sz, horovod)
    valid_files = load_data(input_path_validation, False, val_sz, horovod)

    #print some stats
    if comm_rank == 0:
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Decoder: {}".format(decoder))
        print("Batch normalization: {}".format(batchnorm))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Loss scale factor: {}".format(scale_factor))
        print("Output sampling target: {}".format(output_sampling))
        #print optimizer parameters
        for k, v in optimizer.items():
            print("Solver Parameters: {k}: {v}".format(k=k, v=v))
        print("Num training samples: {}".format(train_files.shape[0]))
        print("Num validation samples: {}".format(valid_files.shape[0]))
        if dummy_data:
            print("Using synthetic dummy data")
        print("Disable checkpoints: {}".format(disable_checkpoints))
        print("Disable image save: {}".format(disable_imsave))

    #compute epochs and stuff:
    if fs_type == "local":
        num_samples = train_files.shape[0] // comm_local_size
    else:
        num_samples = train_files.shape[0] // comm_size
    print("num_samples: {} batch: {}".format(num_samples, batch))
    num_steps_per_epoch = num_samples // batch
    num_steps = num_epochs * num_steps_per_epoch
    if comm_rank == 0:
        print("Number of steps per epoch: {}".format(num_steps_per_epoch))
        print("Number of steps in total: {}".format(num_steps))
    if per_rank_output:
        print("Rank {} does {} steps per epoch".format(comm_rank,
                                                       num_steps_per_epoch))

    with training_graph.as_default():

        if dummy_data:
            dummy_data_args = dict(batchsize=batch,
                                   data_format=data_format,
                                   dtype=dtype)
            trn_dataset = create_dummy_dataset(n_samples=trn_sz,
                                               num_epochs=num_epochs,
                                               **dummy_data_args)
            val_dataset = create_dummy_dataset(n_samples=val_sz,
                                               num_epochs=1,
                                               **dummy_data_args)
        else:
            #create readers
            trn_reader = h5_input_reader(input_path_train,
                                         channels,
                                         weights,
                                         dtype,
                                         normalization_file="stats.h5",
                                         update_on_read=False,
                                         data_format=data_format,
                                         label_id=label_id,
                                         sample_target=output_sampling)
            val_reader = h5_input_reader(input_path_validation,
                                         channels,
                                         weights,
                                         dtype,
                                         normalization_file="stats.h5",
                                         update_on_read=False,
                                         data_format=data_format,
                                         label_id=label_id)
            #create datasets
            if fs_type == "local":
                trn_dataset = create_dataset(trn_reader,
                                             train_files,
                                             batch,
                                             num_epochs,
                                             comm_local_size,
                                             comm_local_rank,
                                             dtype,
                                             shuffle=True)
                val_dataset = create_dataset(val_reader,
                                             valid_files,
                                             batch,
                                             1,
                                             comm_local_size,
                                             comm_local_rank,
                                             dtype,
                                             shuffle=False)
            else:
                trn_dataset = create_dataset(trn_reader,
                                             train_files,
                                             batch,
                                             num_epochs,
                                             comm_size,
                                             comm_rank,
                                             dtype,
                                             shuffle=True)
                val_dataset = create_dataset(val_reader,
                                             valid_files,
                                             batch,
                                             1,
                                             comm_size,
                                             comm_rank,
                                             dtype,
                                             shuffle=False)

        #create iterators
        handle = tf.placeholder(tf.string,
                                shape=[],
                                name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype, tf.string),
            ((batch, len(channels), image_height_orig,
              image_width_orig) if data_format == "channels_first" else
             (batch, image_height_orig, image_width_orig, len(channels)),
             (batch, image_height_orig, image_width_orig),
             (batch, image_height_orig, image_width_orig), (batch)))
        next_elem = iterator.get_next()

        #if downsampling, do some preprocessing
        if downsampling_fact != 1:
            if downsampling_mode == "scale":
                #do downsampling
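                # The random one-hot mask below picks one position inside every
                # downsampling window, so integer labels are subsampled (not averaged),
                # while images and weights go through average pooling.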
                rand_select = tf.cast(tf.one_hot(tf.random_uniform(
                    (batch, image_height, image_width),
                    minval=0,
                    maxval=downsampling_fact * downsampling_fact,
                    dtype=tf.int32),
                                                 depth=downsampling_fact *
                                                 downsampling_fact,
                                                 axis=-1),
                                      dtype=tf.int32)
                next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format), \
                             tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1), \
                                                                                 [1, downsampling_fact, downsampling_fact, 1], \
                                                                                 [1, downsampling_fact, downsampling_fact, 1], \
                                                                                 [1,1,1,1], 'VALID'), rand_select), axis=-1), \
                             tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1), \
                             next_elem[3])
            elif downsampling_mode == "center-crop":
                #some parameters
                length = 1. / float(downsampling_fact)
                offset = length / 2.
                boxes = [[offset, offset, offset + length, offset + length]
                         ] * batch
                box_ind = list(range(0, batch))
                crop_size = [image_height, image_width]

                #be careful with data order
                if data_format == "channels_first":
                    # get_next() yields a tuple, so rebuild it rather than assigning in place
                    next_elem = (tf.transpose(next_elem[0], perm=[0, 2, 3, 1]),
                                 next_elem[1], next_elem[2], next_elem[3])

                #crop
                next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"), \
                             ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1],axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32), \
                             tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2],axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1), \
                             next_elem[3])

                #be careful with data order
                if data_format == "channels_first":
                    next_elem = (tf.transpose(next_elem[0], perm=[0, 3, 1, 2]),
                                 next_elem[1], next_elem[2], next_elem[3])

            else:
                raise ValueError(
                    "Error, downsampling mode {} not supported. Supported are [center-crop, scale]"
                    .format(downsampling_mode))

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #compute the input filter number based on number of channels used
        num_channels = len(channels)
        #set up model
        model = deeplab_v3_plus_generator(num_classes=3,
                                          output_stride=8,
                                          base_architecture=model,
                                          decoder=decoder,
                                          batchnorm=batchnorm,
                                          pre_trained_model=None,
                                          batch_norm_decay=None,
                                          data_format=data_format)

        logit, prediction = model(next_elem[0], True, dtype)

        #set up loss
        loss = None

        #cast the logits to fp32
        logit = ensure_type(logit, tf.float32)
        if loss_type == "weighted":
            #cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1],
                logits=logit,
                weights=w_cast,
                reduction=tf.losses.Reduction.SUM)
            if scale_factor != 1.0:
                loss *= scale_factor

        elif loss_type == "weighted_mean":
            #cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1],
                logits=logit,
                weights=w_cast,
                reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
            if scale_factor != 1.0:
                loss *= scale_factor

        elif loss_type == "focal":
            #one-hot-encode
            labels_one_hot = tf.contrib.layers.one_hot_encoding(
                next_elem[1], 3)
            #cast to FP32
            labels_one_hot = ensure_type(labels_one_hot, tf.float32)
            loss = focal_loss(onehot_labels=labels_one_hot,
                              logits=logit,
                              alpha=1.,
                              gamma=2.)

        else:
            raise ValueError(
                "Error, loss type {} not supported.".format(loss_type))

        #determine flops
        flops = graph_flops.graph_flops(
            format="NHWC" if data_format == "channels_last" else "NCHW",
            verbose=False,
            batch=batch,
            sess_config=sess_config)
        flops *= comm_size
        if comm_rank == 0:
            print('training flops: {:.3f} TF/step'.format(flops * 1e-12))

        #number of trainable parameters
        if comm_rank == 0:
            num_params = get_number_of_trainable_parameters()
            print('number of trainable parameters: {} ({} MB)'.format(
                num_params,
                num_params * (4 if dtype == tf.float32 else 2) * (2**-20)))

        if horovod:
            loss_avg = hvd.allreduce(ensure_type(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)
        tmpl = (loss if per_rank_output else loss_avg)

        #set up global step - keep on CPU
        with tf.device('/device:CPU:0'):
            global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        if optimizer['opt_type'].startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op, lr = get_larc_optimizer(optimizer, loss, global_step,
                                              num_steps_per_epoch, horovod)
        else:
            train_op, lr = get_optimizer(optimizer, loss, global_step,
                                         num_steps_per_epoch, horovod)

        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(labels=next_elem[1],
                                                    predictions=tf.argmax(
                                                        prediction, axis=3),
                                                    num_classes=3,
                                                    weights=None,
                                                    metrics_collections=None,
                                                    updates_collections=None,
                                                    name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables() if i.name.startswith('iou_score/')
        ])

        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        if "gpu" in device.lower():
            with tf.device(device):
                mem_usage_ops = [
                    tf.contrib.memory_stats.MaxBytesInUse(),
                    tf.contrib.memory_stats.BytesLimit()
                ]
        #hooks
        #these hooks are essential. regularize the step hook by adding one additional step at the end
        #hooks = [tf.train.StopAtStepHook(last_step=3)]
        #hooks = [tf.train.StopAtStepHook(num_steps=3)]
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        nvtx_callback = NVTXHook(skip_n_steps=0, name='TTTTTrain')
        hooks.append(nvtx_callback)
        #bcast init for bcasting the model after start
        if horovod:
            init_bcast = hvd.broadcast_global_variables(0)
        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = 5 * num_steps_per_epoch
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            if (not disable_checkpoints):
                hooks.append(
                    tf.train.CheckpointSaverHook(
                        checkpoint_dir=checkpoint_dir,
                        save_steps=checkpoint_save_freq,
                        saver=checkpoint_saver))
            #create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        #tracing
        if tracing is not None:
            import tracehook
            tracing_hook = tracehook.TraceHook(steps_to_trace=tracing,
                                               cache_traces=True,
                                               trace_dir=trace_dir)
            hooks.append(tracing_hook)
            print("############ tracing enabled")

        # instead of averaging losses over an entire epoch, use a moving
        #  window average
        recent_losses = []
        loss_window_size = 10

        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])
            #restore from checkpoint:
            if comm_rank == 0 and not disable_checkpoints:
                load_model(sess, checkpoint_saver, checkpoint_dir)
            #broadcast loaded model variables
            if horovod:
                sess.run(init_bcast)
            #create iterator handles
            trn_handle, val_handle = sess.run(
                [trn_handle_string, val_handle_string])
            #init iterators
            sess.run(trn_init_op, feed_dict={handle: trn_handle})
            sess.run(val_init_op, feed_dict={handle: val_handle})

            # figure out what step we're on (it won't be 0 if we are
            #  restoring from a checkpoint) so we can count from there
            train_steps = sess.run([global_step])[0]

            #do the training
            epoch = 1
            step = 1

            prev_mem_usage = 0
            t_sustained_start = time.time()
            r_peak = 0

            #warmup loops
            print("### Warmup for 5 steps")
            start_time = time.time()
            #while not sess.should_stop():
            for _ in range(5):
                #try:
                print('warmup train_steps is {}'.format(train_steps))
                if train_steps == 5:
                    #                    if have_pycuda:
                    #                        pyc.driver.start_profiler()
                    print(train_steps)
                _ = sess.run([train_op], feed_dict={handle: trn_handle})
                #tmp_loss = sess.run([(loss if per_rank_output else loss_avg)],feed_dict={handle: trn_handle})

                if train_steps == 5:
                    #                    if have_pycuda:
                    #                        pyc.driver.stop_profiler()
                    print(train_steps)

                train_steps += 1

            end_time = time.time()
            print("### Warmup time: {:0.2f}".format(end_time - start_time))

            ### Start profiling
            print('Begin training loop')
            #if have_cupy:
            #cupy.cuda.profiler.start()
            #            if have_pycuda:
            #                pyc.driver.start_profiler()
            #while not sess.should_stop():
            for _ in range(1):
                try:
                    print('train_steps is {}'.format(train_steps))
                    if train_steps == 5:
                        if have_pycuda:
                            pyc.driver.start_profiler()
                        print(train_steps)
                    _ = sess.run([tmpl], feed_dict={handle: trn_handle})
                    #                    _ = sess.run([train_op],feed_dict={handle: trn_handle})
                    if train_steps == 5:
                        if have_pycuda:
                            pyc.driver.stop_profiler()
                        print(train_steps)
                    train_steps += 1
                except tf.errors.OutOfRangeError:
                    break


            # if have_pycuda:
            #     pyc.driver.stop_profiler()

        ### End of profiling
        # if have_cupy:
        #     cupy.cuda.profiler.stop()

        # write any cached traces to disk
        if tracing is not None:
            tracing_hook.write_traces()

    print('All done')
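recent_losses and loss_window_size are declared above, but the windowed averaging itself falls outside the truncated loop. A small sketch of the moving-window loss average they imply (a hypothetical helper, standard library only):

from collections import deque

class MovingAverage:
    # Keep only the last `window` loss values and report their mean.
    def __init__(self, window=10):
        self.values = deque(maxlen=window)

    def update(self, value):
        self.values.append(value)
        return sum(self.values) / len(self.values)

# e.g. smoother = MovingAverage(loss_window_size); smoothed = smoother.update(loss_value)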
Ejemplo n.º 28
0
def main():
   ''' simple starter program for tensorflow models. '''

   parser = argparse.ArgumentParser(description='')
   parser.add_argument('-c', '--config', dest='config_filename',
                       help='configuration filename in json format [default: %s]' % DEFAULT_CONFIG,
                       default=DEFAULT_CONFIG)
   parser.add_argument('--interop',
                       help='set Tensorflow "inter_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTEROP,
                       default=DEFAULT_INTEROP)
   parser.add_argument('--intraop',
                       help='set Tensorflow "intra_op_parallelism_threads" session config varaible [default: %s]' % DEFAULT_INTRAOP,
                       default=DEFAULT_INTRAOP)
   parser.add_argument('-l', '--logdir', default=DEFAULT_LOGDIR,
                       help='define location to save log information [default: %s]' % DEFAULT_LOGDIR)

   parser.add_argument('-r', '--restore', help='define location from which to load a saved model')

   parser.add_argument('--horovod', default=False, action='store_true', help="use horovod for MPI parallel training")
   parser.add_argument('--float16', default=False, action='store_true', help="use float16 precision training")

   parser.add_argument('--debug', dest='debug', default=False, action='store_true', help="Set Logger to DEBUG")
   parser.add_argument('--error', dest='error', default=False, action='store_true', help="Set Logger to ERROR")
   parser.add_argument('--warning', dest='warning', default=False, action='store_true', help="Set Logger to WARNING")
   parser.add_argument('--logfilename', dest='logfilename', default=None,
                       help='if set, logging information will go to file')
   args = parser.parse_args()

   # default log level; can be overridden by --debug/--error/--warning
   logging_level = logging.INFO
   if args.debug and not args.error and not args.warning:
      logging_level = logging.DEBUG
   elif not args.debug and args.error and not args.warning:
      logging_level = logging.ERROR
   elif not args.debug and not args.error and args.warning:
      logging_level = logging.WARNING

   logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:%(name)s:%(message)s'
   logging_datefmt = '%Y-%m-%d %H:%M:%S'
   hvd = None
   rank = 0
   nranks = 1
   local_rank = 0
   local_nranks = 1
   if args.horovod:
      import horovod
      import horovod.tensorflow as hvd
      hvd.init()
      rank = hvd.rank()
      nranks = hvd.size()
      local_rank = hvd.local_rank()
      local_nranks = hvd.local_size()
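      # pin this process to a single GPU based on its node-local rank (assumes one GPU per rank)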
      os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())
      logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:' + (
               '%05d' % rank) + ':%(name)s:%(message)s'
      if rank > 0:
         logging_level = logging.WARNING

   logging.basicConfig(level=logging_level,
                       format=logging_format,
                       datefmt=logging_datefmt,
                       filename=args.logfilename)

   logging.warning('rank: %5d   size: %5d  local rank: %5d  local size: %5d', rank, nranks, local_rank, local_nranks)
   if 'CUDA_VISIBLE_DEVICES' in os.environ:
      logging.warning('CUDA_VISIBLE_DEVICES=%s %s', os.environ['CUDA_VISIBLE_DEVICES'], device_lib.list_local_devices())
   else:
      logging.info('CUDA_VISIBLE_DEVICES not defined in os.environ')
   logging.info('using tensorflow version:   %s', tf.__version__)
   logging.info('using tensorflow from:      %s', tf.__file__)
   if hvd:
      logging.info('using horovod version:      %s', horovod.__version__)
      logging.info('using horovod from:         %s', horovod.__file__)
   logging.info('logdir:                     %s', args.logdir)
   logging.info('interop:                    %s', args.interop)
   logging.info('intraop:                    %s', args.intraop)
   logging.info('restore:                    %s', args.restore)
   logging.info('float16:                    %s', args.float16)

   device_str = '/CPU:0'
   if tf.test.is_gpu_available():
      gpus = tf.config.experimental.list_logical_devices('GPU')
      logger.warning('gpus = %s', gpus)

      device_str = gpus[0].name

   logger.warning('device:                     %s', device_str)

   config = json.load(open(args.config_filename))
   config['device'] = device_str
   config['float16'] = args.float16

   nclasses = len(config['data']['classes'])
   dtype_input = tf.float16 if config['float16'] else tf.float32
   dtype_target = tf.int16 if config['float16'] else tf.int32
   bn = config['data']['batch_size'] > 1

   logger.info('-=-=-=-=-=-=-=-=-  CONFIG FILE -=-=-=-=-=-=-=-=-')
   logger.info('%s = \n %s', args.config_filename, json.dumps(config, indent=4, sort_keys=True))
   logger.info('-=-=-=-=-=-=-=-=-  CONFIG FILE -=-=-=-=-=-=-=-=-')
   if hvd:
      config['hvd'] = hvd
   config['rank'] = rank
   config['nranks'] = nranks

   with tf.Graph().as_default():

      logger.info('getting datasets')
      trainds, validds = data_handler.get_datasets(config)

      iterator = tf.compat.v1.data.Iterator.from_structure((dtype_input, dtype_target), (
      (config['data']['batch_size'], config['data']['num_points'], config['data']['num_features']),
      (config['data']['batch_size'], config['data']['num_points'])))
      input, target = iterator.get_next()
      training_init_op = iterator.make_initializer(trainds)
      validation_init_op = iterator.make_initializer(validds)

      with tf.device(device_str):

         # input, target = pointnet_seg.placeholder_inputs(config['data']['batch_size'],
         #                                                 config['data']['num_points'],
         #                                                 config['data']['num_features'])

         # handle = tf.compat.v1.placeholder(tf.string,shape=[])
         # iterator = tf.compat.v1.data.Iterator.from_string_handle(handle,(tf.float32,tf.int32),((config['data']['batch_size'],config['data']['num_points'],config['data']['num_features']),(config['data']['batch_size'],config['data']['num_points'])))
         # input,target = iterator.get_next()

         # iter_train = trainds.make_one_shot_iterator()
         # iter_valid = validds.make_one_shot_iterator()

         is_training_pl = tf.compat.v1.placeholder(tf.bool, shape=())
         batch = tf.Variable(0)


         pred, endpoints = pointnet_seg.get_model(input, is_training_pl, nclasses, dtype=dtype_input, bn=bn)
         loss = pointnet_seg.get_loss(pred, target, endpoints, dtype=dtype_input)
         tf.compat.v1.summary.scalar('loss/combined', loss)

         accuracy = pointnet_seg.get_accuracy(pred, target, dtype=dtype_input)
         tf.compat.v1.summary.scalar('accuracy/combined', accuracy)

         learning_rate = pointnet_seg.cyclic_learning_rate(batch * config['data']['batch_size'], config)
         tf.compat.v1.summary.scalar('learning_rate', learning_rate)
         optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
         if hvd:
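            # wrap the optimizer so gradients are averaged across ranks via allreduce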
            optimizer = hvd.DistributedOptimizer(optimizer)
         train_op = optimizer.minimize(loss, global_step=batch)
         # grads_and_vars = optimizer.compute_gradients(loss, tf.trainable_variables())
         # train = optimizer.apply_gradients(grads_and_vars)

         # Add ops to save and restore all the variables.
         saver = tf.train.Saver()

         merged = tf.compat.v1.summary.merge_all()

      logger.info('create session')

      config_proto = tf.compat.v1.ConfigProto()
      if 'GPU' in device_str:
         config_proto.gpu_options.allow_growth = True
         config_proto.gpu_options.visible_device_list = os.environ['CUDA_VISIBLE_DEVICES']
      else:
         config_proto.allow_soft_placement = True
         config_proto.intra_op_parallelism_threads = args.intraop
         config_proto.inter_op_parallelism_threads = args.interop

      # create the session
      sess = tf.compat.v1.Session(config=config_proto)

      if rank == 0:
         train_writer = tf.compat.v1.summary.FileWriter(os.path.join(args.logdir, 'train'), sess.graph)
         valid_writer = tf.compat.v1.summary.FileWriter(os.path.join(args.logdir, 'valid'), sess.graph)

      #    train_handle = sess.run(iter_train.string_handle())
      if args.restore:
         logger.info('restoring model: %s', args.restore)
         saver.restore(sess, args.restore)
      else:
         init = tf.compat.v1.global_variables_initializer()
         sess.run(init, {is_training_pl: True})
      if hvd:
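         # sync initial (or restored) weights from rank 0 to all other ranks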
         sess.run(hvd.broadcast_global_variables(0))

      logger.info('running over data')
      status_interval = config['training']['status']
      total_acc = 0.
      loss_sum = 0.
      for epoch in range(config['training']['epochs']):
         logger.info('epoch %s of %s  (logdir: %s)', epoch + 1, config['training']['epochs'], args.logdir)
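         # (re)initialize the training iterator so each epoch starts from the beginning of the dataset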
         sess.run(training_init_op)

         start = time.time()
         while True:

            try:
               feed_dict = {is_training_pl: True}
               summary, step, _, loss_val, accuracy_val = sess.run([merged, batch,
                                                                    train_op, loss, accuracy], feed_dict=feed_dict)

               if rank == 0:
                  train_writer.add_summary(summary, step)

               total_acc += accuracy_val
               loss_sum += loss_val

               # logger.info(f'pred_val.shape   = {pred_val.shape}')
               # logger.info(f'target_val.shape = {target_val.shape}')

               if step % status_interval == 0:
                  end = time.time()
                  duration = end - start
                  logger.info('step: %10d   mean loss: %10.6f   accuracy:  %10.6f  imgs/sec: %10.6f',
                              step,
                              loss_sum / float(status_interval), total_acc / float(status_interval),
                              float(status_interval) * config['data']['batch_size'] / duration)
                  start = time.time()
                  total_acc = 0.
                  loss_sum = 0.
            except tf.errors.OutOfRangeError:
               ss = saver.save(sess, os.path.join(args.logdir, "model.ckpt"), global_step=step)

               logger.info(' end of epoch: %s', ss)
               break

         sess.run(validation_init_op)
         logger.info('running validation')
         total_acc = 0.
         total_loss = 0.
         steps = 0.
         while True:
            try:
               feed_dict = {is_training_pl: False}
               summary, valid_step, loss_val, accuracy_val = sess.run([merged, batch, loss, accuracy],
                                                                      feed_dict=feed_dict)
               total_acc += accuracy_val
               total_loss += loss_val
               steps += 1.
               logger.info('valid step: %s', valid_step)
               if rank == 0:
                  valid_writer.add_summary(summary, valid_step)
            except tf.errors.OutOfRangeError:
               total_loss = total_loss / steps
               total_acc = total_acc / steps
               logger.info(' end of validation  mean loss: %10.6f  mean acc: %10.6f', total_loss, total_acc)

               break
Example No. 29
def main():
    """
  Main entry.
  """
    print("pid %i: Hello" % os.getpid())
    print("Python version:", sys.version)

    print("Env:")
    for key, value in sorted(os.environ.items()):
        print("%s=%s" % (key, value))
    print()

    if os.environ.get("PE_HOSTFILE", ""):
        try:
            print("PE_HOSTFILE, %s:" % os.environ["PE_HOSTFILE"])
            with open(os.environ["PE_HOSTFILE"], "r") as f:
                print(f.read())
        except FileNotFoundError as exc:
            print(exc)

    if os.environ.get("SGE_JOB_SPOOL_DIR", ""):
        print("SGE_JOB_SPOOL_DIR, %s:" % os.environ["SGE_JOB_SPOOL_DIR"])
        for name in os.listdir(os.environ["SGE_JOB_SPOOL_DIR"]):
            print(name)
        print()

    if os.environ.get("OMPI_FILE_LOCATION", ""):
        print("OMPI_FILE_LOCATION, %s:" % os.environ["OMPI_FILE_LOCATION"])
        d = os.path.dirname(os.path.dirname(os.environ["OMPI_FILE_LOCATION"]))
        print("dir:", d)
        for name in os.listdir(d):
            print(name)
        print()
        print("contact.txt:")
        with open("%s/contact.txt" % d, "r") as f:
            print(f.read())
        print()

    # https://github.com/horovod/horovod/issues/1123
    try:
        import ctypes
        ctypes.CDLL("libhwloc.so", mode=ctypes.RTLD_GLOBAL)
    except Exception as exc:
        print("Exception while loading libhwloc.so, ignoring...", exc)

    print("sys.path:")
    i = 0
    for p in list(sys.path):
        print(p)
        if "/.local/lib/" in p:
            # small workaround if the order is messed up... prefer from .local/lib
            print("(insert at position %i)" % i)
            sys.path.insert(i, p)
            i += 1
    print()

    try:
        from mpi4py import MPI  # noqa
        name = MPI.Get_processor_name()
        comm = MPI.COMM_WORLD
        print("mpi4py:", "name: %s," % name, "rank: %i," % comm.Get_rank(),
              "size: %i" % comm.Get_size())
        hosts = comm.allgather(
            (comm.Get_rank(), name))  # Get the names of all the other hosts
        print("  all hosts:", {key: item for (key, item) in hosts})
    except ImportError:
        print("mpi4py not available")

    print("Import TF now...")
    import tensorflow as tf
    print("TF version:", tf.__version__)

    import horovod  # noqa
    print("Horovod version:", horovod.__version__)
    import horovod.tensorflow as hvd  # noqa

    # Initialize Horovod
    hvd.init()

    print("pid %i: hvd: rank: %i, size: %i, local_rank %i, local_size %i" %
          (os.getpid(), hvd.rank(), hvd.size(), hvd.local_rank(),
           hvd.local_size()))
Example No. 30
#!/usr/bin/env python3

import os
print("pid %i: Hello" % os.getpid())

import tensorflow as tf
import horovod.tensorflow as hvd


# Initialize Horovod
hvd.init()

print("pid %i: hvd: rank: %i, size: %i, local_rank %i, local_size %i" % (os.getpid(), hvd.rank(), hvd.size(), hvd.local_rank(), hvd.local_size()))
Example No. 31
def main(input_path, blocks, weights, image_dir, checkpoint_dir, trn_sz,
         learning_rate, loss_type, fs_type, opt_type, batch, batchnorm,
         num_epochs, dtype, chkpt, filter_sz, growth, disable_training,
         enable_tf_timeline):
    options = None
    run_metadata = None
    many_runs_timeline = None

    timeline_trace_fp = open("timeline_trace.pickle", "wb")

    options, run_metadata, many_runs_timeline, min_timeline_step, max_timeline_step = \
        init_timeline_configs(enable_tf_timeline, tf.RunOptions.FULL_TRACE, -1, -1)

    global_time_logger = logger(-1, "Global Total Time", -1, True)
    global_time_logger.start_timer()

    #init horovod

    initialization_timer_logger = logger(-1, "Initialize Horovod", -1, True)
    initialization_timer_logger.start_timer()

    nvtx.RangePush("init horovod", 1)
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        #not all horovod versions have that implemented
        try:
            comm_local_size = hvd.local_size()
        except Exception:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".
                  format(comm_size, comm_rank))
    nvtx.RangePop()  # init horovod

    initialization_timer_logger.set_rank(int(comm_rank))
    initialization_timer_logger.end_timer()

    global_time_logger.set_rank(int(comm_rank))

    #parameters
    channels = [0, 1, 2, 10]
    per_rank_output = False
    loss_print_interval = 1

    #session config

    initialization_timer_logger.start_timer(comm_rank, "Configure Session")

    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=6,  #1
        intra_op_parallelism_threads=1,  #6
        log_device_placement=False,
        allow_soft_placement=True)
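    # expose only the GPU matching this rank's node-local rank to TensorFlow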
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)

    initialization_timer_logger.end_timer()

    #get data

    initialization_timer_logger.start_timer(comm_rank, "Get Data")

    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    trn_data, val_data, tst_data = load_data(input_path, trn_sz, comm_rank)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("done.")

    initialization_timer_logger.end_timer()

    #print some stats
    if comm_rank == 0:
        print("Learning Rate: {}".format(learning_rate))
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Batch normalization: {}".format(batchnorm))
        print("Blocks: {}".format(blocks))
        print("Growth rate: {}".format(growth))
        print("Filter size: {}".format(filter_sz))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Optimizer type: {}".format(opt_type))
        print("Num training samples: {}".format(trn_data.shape[0]))
        print("Num validation samples: {}".format(val_data.shape[0]))

    io_training_time_logger = logger(comm_rank, "IO and Training", -1, True)
    io_training_time_logger.start_timer()

    with training_graph.as_default():
        nvtx.RangePush("TF Init", 3)
        #create readers
        trn_reader = h5_input_reader(input_path,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     comm_rank=comm_rank)
        val_reader = h5_input_reader(input_path,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     comm_rank=comm_rank)
        #create datasets
        if fs_type == "local":
            trn_dataset = create_dataset(trn_reader,
                                         trn_data,
                                         batch,
                                         num_epochs,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=True)
            val_dataset = create_dataset(val_reader,
                                         val_data,
                                         batch,
                                         1,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=False)
        else:
            trn_dataset = create_dataset(trn_reader,
                                         trn_data,
                                         batch,
                                         num_epochs,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=True)
            val_dataset = create_dataset(val_reader,
                                         val_data,
                                         batch,
                                         1,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=False)

        #create iterators
        handle = tf.placeholder(tf.string,
                                shape=[],
                                name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype),
            ((batch, len(channels), image_height, image_width),
             (batch, image_height, image_width),
             (batch, image_height, image_width)))
        next_elem = iterator.get_next()

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #set up model
        logit, prediction = create_tiramisu(3,
                                            next_elem[0],
                                            image_height,
                                            image_width,
                                            len(channels),
                                            loss_weights=weights,
                                            nb_layers_per_block=blocks,
                                            p=0.2,
                                            wd=1e-4,
                                            dtype=dtype,
                                            batchnorm=batchnorm,
                                            growth_rate=growth,
                                            filter_sz=filter_sz,
                                            comm_rank=comm_rank)

        #set up loss
        labels_one_hot = tf.cast(tf.contrib.layers.one_hot_encoding(
            next_elem[1], 3),
                                 dtype=dtype)
        loss = None
        if loss_type == "weighted":
            loss = tf.losses.softmax_cross_entropy(
                onehot_labels=labels_one_hot,
                logits=logit,
                weights=next_elem[2])
        elif loss_type == "focal":
            loss = focal_loss(onehot_labels=labels_one_hot,
                              logits=logit,
                              alpha=1.,
                              gamma=2.)
        else:
            raise ValueError(
                "Error, loss type {} not supported.".format(loss_type))
        if horovod:
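            # average the scalar loss across ranks so every worker reports the same value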
            loss_avg = hvd.allreduce(tf.cast(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)

        #set up global step
        global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        if opt_type.startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op = get_larc_optimizer(opt_type.split("-")[1],
                                          loss,
                                          global_step,
                                          learning_rate,
                                          LARC_mode="clip",
                                          LARC_eta=0.002,
                                          LARC_epsilon=1. / 16000.)
        else:
            train_op = get_optimizer(opt_type, loss, global_step,
                                     learning_rate)
        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(labels=next_elem[1],
                                                    predictions=tf.argmax(
                                                        prediction, axis=3),
                                                    num_classes=3,
                                                    weights=None,
                                                    metrics_collections=None,
                                                    updates_collections=None,
                                                    name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables() if i.name.startswith('iou_score/')
        ])

        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        #compute epochs and stuff:
        if fs_type == "local":
            num_samples = trn_data.shape[0] // comm_local_size
        else:
            num_samples = trn_data.shape[0] // comm_size
        #num_steps_per_epoch = num_samples // batch
        num_steps_per_epoch = 10
        num_steps = num_epochs * num_steps_per_epoch
        if per_rank_output:
            print("Rank {} does {} steps per epoch".format(
                comm_rank, num_steps_per_epoch))

        #hooks
        #these hooks are essential. regularize the step hook by adding one additional step at the end
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        #bcast init for bcasting the model after start
        init_bcast = hvd.broadcast_global_variables(0)
        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = num_steps_per_epoch * 2
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            listener = checkpoint_listener(comm_rank, True)
            hooks.append(
                tf.train.CheckpointSaverHook(checkpoint_dir=checkpoint_dir,
                                             save_steps=checkpoint_save_freq,
                                             saver=checkpoint_saver,
                                             listeners=[listener]))
            #create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        ##DEBUG
        ##summary
        #if comm_rank == 0:
        #    print("write graph for debugging")
        #    tf.summary.scalar("loss",loss)
        #    summary_op = tf.summary.merge_all()
        #    #hooks.append(tf.train.SummarySaverHook(save_steps=num_steps_per_epoch, summary_writer=summary_writer, summary_op=summary_op))
        #    with tf.Session(config=sess_config) as sess:
        #        sess.run([init_op, init_local_op])
        #        #create iterator handles
        #        trn_handle = sess.run(trn_handle_string)
        #        #init iterators
        #        sess.run(trn_init_op, feed_dict={handle: trn_handle, datafiles: trn_data, labelfiles: trn_labels})
        #        #summary:
        #        sess.run(summary_op, feed_dict={handle: trn_handle})
        #        #summary file writer
        #        summary_writer = tf.summary.FileWriter('./logs', sess.graph)
        ##DEBUG

        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])

            #restore from checkpoint:
            if comm_rank == 0:
                load_model(sess, checkpoint_saver, checkpoint_dir, comm_rank)
            #broadcast loaded model variables
            sess.run(init_bcast)

            #create iterator handles
            trn_handle, val_handle = sess.run(
                [trn_handle_string, val_handle_string],
                options=options,
                run_metadata=run_metadata)

            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "create_iterator_handle.json")

            #init iterators
            sess.run(trn_init_op,
                     feed_dict={handle: trn_handle},
                     options=options,
                     run_metadata=run_metadata)

            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "init_train_iterator_handle.json")

            sess.run(val_init_op,
                     feed_dict={handle: val_handle},
                     options=options,
                     run_metadata=run_metadata)

            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "init_val_iterator_handle.json")

            nvtx.RangePop()  # TF Init

            # do the training
            epoch = 1
            step = 1
            train_loss = 0.
            nvtx.RangePush("Training Loop", 4)
            nvtx.RangePush("Epoch", epoch)
            start_time = time.time()

            training_loop_timer_logger = logger(comm_rank, "Training Loop", -1,
                                                True)
            training_loop_timer_logger.start_timer()

            train_steps = 0
            while not (sess.should_stop()):
                #training loop
                try:
                    training_iteration_time_logger = logger(
                        comm_rank, "Training Iteration", epoch, True)
                    training_iteration_time_logger.start_timer()

                    nvtx.RangePush("Step", step)

                    if disable_training:
                        train_steps = sess.run([global_step],
                                               feed_dict={handle: trn_handle},
                                               options=options,
                                               run_metadata=run_metadata)

                        update_timeline_in_range(
                            enable_tf_timeline, run_metadata,
                            many_runs_timeline, train_steps[0],
                            "train_" + str(global_step) + ".json",
                            min_timeline_step, max_timeline_step)

                        train_steps_in_epoch = train_steps[
                            0] % num_steps_per_epoch

                        # do the validation phase
                        if train_steps_in_epoch == 0:
                            eval_steps = 0
                            while True:
                                try:
                                    sess.run([next_elem[1]],
                                             feed_dict={handle: val_handle},
                                             options=options,
                                             run_metadata=run_metadata)

                                    update_timeline_in_range(
                                        enable_tf_timeline, run_metadata,
                                        many_runs_timeline,
                                        "val_dict" + str(eval_steps) + ".json")

                                    eval_steps += 1
                                except tf.errors.OutOfRangeError:
                                    sess.run(val_init_op,
                                             feed_dict={handle: val_handle},
                                             options=options,
                                             run_metadata=run_metadata)

                                    update_timeline_in_range(
                                        enable_tf_timeline, run_metadata,
                                        many_runs_timeline, "val_dict_out_" +
                                        str(eval_steps) + ".json")

                                    break

                    else:
                        # construct feed dict
                        _, train_steps, tmp_loss = sess.run(
                            [
                                train_op, global_step,
                                (loss if per_rank_output else loss_avg)
                            ],
                            feed_dict={handle: trn_handle},
                            options=options,
                            run_metadata=run_metadata)

                        update_timeline_in_range(
                            enable_tf_timeline, run_metadata,
                            many_runs_timeline, train_steps,
                            "val_" + str(global_step) + ".json",
                            min_timeline_step, max_timeline_step)

                        if comm_rank == 0:
                            step_trace_fp = open(
                                "train_step_trace_" + str(global_step) +
                                ".pickle", "wb")
                            pickle.dump(run_metadata, step_trace_fp)

                        train_steps_in_epoch = train_steps % num_steps_per_epoch
                        train_loss += tmp_loss
                        nvtx.RangePop()  # Step
                        step += 1

                        #print step report
                        eff_steps = train_steps_in_epoch if (
                            train_steps_in_epoch > 0) else num_steps_per_epoch
                        if (train_steps % loss_print_interval) == 0:
                            if per_rank_output:
                                print(
                                    "REPORT: rank {}, training loss for step {} (of {}) is {}, time {}"
                                    .format(comm_rank, train_steps, num_steps,
                                            train_loss / eff_steps,
                                            time.time() - start_time))
                            else:
                                if comm_rank == 0:
                                    print(
                                        "REPORT: training loss for step {} (of {}) is {}, time {}"
                                        .format(train_steps, num_steps,
                                                train_loss / eff_steps,
                                                time.time() - start_time))

                        #do the validation phase
                        if train_steps_in_epoch == 0:
                            end_time = time.time()
                            #print epoch report
                            train_loss /= num_steps_per_epoch
                            if per_rank_output:
                                print(
                                    "COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {} s"
                                    .format(comm_rank, epoch, num_epochs,
                                            train_loss,
                                            time.time() - start_time))
                            else:
                                if comm_rank == 0:
                                    print(
                                        "COMPLETED: training loss for epoch {} (of {}) is {}, time {} s"
                                        .format(epoch, num_epochs, train_loss,
                                                time.time() - start_time))

                            #evaluation loop
                            eval_loss = 0.
                            eval_steps = 0
                            nvtx.RangePush("Eval Loop", 7)
                            timeline_help_count = 0
                            while True:
                                try:
                                    #construct feed dict
                                    _, tmp_loss, val_model_predictions, val_model_labels = sess.run(
                                        [
                                            iou_update_op,
                                            (loss
                                             if per_rank_output else loss_avg),
                                            prediction, next_elem[1]
                                        ],
                                        feed_dict={handle: val_handle},
                                        options=options,
                                        run_metadata=run_metadata)

                                    update_timeline_in_range(
                                        enable_tf_timeline, run_metadata,
                                        many_runs_timeline,
                                        timeline_help_count,
                                        "train_" + str(global_step) + ".json",
                                        min_timeline_step, max_timeline_step)

                                    if comm_rank == 0:
                                        step_trace_fp = open(
                                            "validation_step_trace_" +
                                            str(global_step) + ".pickle", "wb")
                                        pickle.dump(run_metadata,
                                                    step_trace_fp)

                                    timeline_help_count += 1

                                    #print some images
                                    if comm_rank == 0:
                                        if have_imsave:
                                            imsave(
                                                image_dir +
                                                '/test_pred_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.png',
                                                np.argmax(
                                                    val_model_predictions[0,
                                                                          ...],
                                                    axis=2) * 100)
                                            imsave(
                                                image_dir +
                                                '/test_label_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.png',
                                                val_model_labels[0, ...] * 100)
                                            imsave(
                                                image_dir +
                                                '/test_combined_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.png',
                                                colormap[
                                                    val_model_labels[0, ...],
                                                    np.argmax(
                                                        val_model_predictions[
                                                            0, ...],
                                                        axis=2)])
                                        else:
                                            np.save(
                                                image_dir +
                                                '/test_pred_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.npy',
                                                np.argmax(
                                                    val_model_predictions[0,
                                                                          ...],
                                                    axis=2) * 100)
                                            np.save(
                                                image_dir +
                                                '/test_label_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.npy',
                                                val_model_labels[0, ...] * 100)

                                    eval_loss += tmp_loss
                                    eval_steps += 1
                                except tf.errors.OutOfRangeError:
                                    eval_steps = np.max([eval_steps, 1])
                                    eval_loss /= eval_steps
                                    if per_rank_output:
                                        print(
                                            "COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}"
                                            .format(comm_rank, epoch,
                                                    num_epochs, eval_loss))
                                    else:
                                        if comm_rank == 0:
                                            print(
                                                "COMPLETED: evaluation loss for epoch {} (of {}) is {}"
                                                .format(
                                                    epoch, num_epochs,
                                                    eval_loss))
                                    if per_rank_output:
                                        iou_score = sess.run(iou_op)

                                        print(
                                            "COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}"
                                            .format(comm_rank, epoch,
                                                    num_epochs, iou_score))
                                    else:
                                        iou_score = sess.run(iou_avg)

                                        if comm_rank == 0:
                                            print(
                                                "COMPLETED: evaluation IoU for epoch {} (of {}) is {}"
                                                .format(
                                                    epoch, num_epochs,
                                                    iou_score))
                                    sess.run(iou_reset_op)

                                    sess.run(val_init_op,
                                             feed_dict={handle: val_handle},
                                             options=options,
                                             run_metadata=run_metadata)

                                    update_timeline_in_range(
                                        enable_tf_timeline, run_metadata,
                                        many_runs_timeline,
                                        "train_" + str(global_step) + ".json")

                                    if comm_rank == 0:
                                        step_trace_fp = open(
                                            "validation_step_trace_out.pickle",
                                            "wb")
                                        pickle.dump(run_metadata,
                                                    step_trace_fp)

                                    break
                            nvtx.RangePop()  # Eval Loop

                    if enable_tf_timeline:
                        many_runs_timeline.save('Timeliner_output.json')

                    # reset counters
                    epoch += 1
                    train_loss = 0.
                    step = 0

                    nvtx.RangePop()  # Epoch
                    nvtx.RangePush("Epoch", epoch)

                    training_iteration_time_logger.end_timer()

                except tf.errors.OutOfRangeError:
                    break

            nvtx.RangePop()  # Epoch
            nvtx.RangePop()  # Training Loop

            training_loop_timer_logger.end_timer()

    if enable_tf_timeline:
        many_runs_timeline.save('Timeliner_output.json')

    io_training_time_logger.end_timer()
    global_time_logger.end_timer()
Example No. 32
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.freeze(False)  # populate new keys now
    _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1  # +1 background
    _C.DATA.BASEDIR = os.path.expanduser(_C.DATA.BASEDIR)
    if isinstance(_C.DATA.VAL, six.string_types):  # support single string (the typical case) as well
        _C.DATA.VAL = (_C.DATA.VAL, )

    assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN',
                                'None'], _C.BACKBONE.NORM
    if _C.BACKBONE.NORM != 'FreezeBN':
        assert not _C.BACKBONE.FREEZE_AFFINE
    assert _C.BACKBONE.FREEZE_AT in [0, 1, 2]

    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    # [3] because we build FPN with features r2,r3,r4,r5
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[3]

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(
            _C.PREPROC.MAX_SIZE / size_mult) * size_mult
        assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint']
        assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.NORM in ['None', 'GN']

        if _C.FPN.CASCADE:
            # the first threshold is the proposal sampling threshold
            assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH
            assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == len(_C.CASCADE.IOUS)

    if is_training:
        train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE
        if isinstance(train_scales, (list, tuple)) and train_scales[1] - train_scales[0] > 100:
            # don't autotune if augmentation is on
            os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()

            if ngpu == hvd.local_size():
                logger.warn(
                    "It's not recommended to use horovod for single-machine training. "
                    "Replicated trainer is more stable and has the same efficiency."
                )
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu > 0, "Has to train with GPU!"
        assert ngpu % 8 == 0 or 8 % ngpu == 0, "Can only train with 1,2,4 or >=8 GPUs, but found {} GPUs".format(
            ngpu)
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        ngpu = get_num_gpu()

    if _C.TRAIN.NUM_GPUS is None:
        _C.TRAIN.NUM_GPUS = ngpu
    else:
        if _C.TRAINER == 'horovod':
            assert _C.TRAIN.NUM_GPUS == ngpu
        else:
            assert _C.TRAIN.NUM_GPUS <= ngpu

    _C.freeze()
    logger.info("Config: ------------------------------------------\n" +
                str(_C))
Example No. 33
args, unknown = parser.parse_known_args()
print(args)

size, rank, local_size, local_rank = None, None, None, None
if args.nccl:
    import horovod.tensorflow as dist
else:
    import smdistributed.dataparallel.tensorflow as dist
    import smddpcommon as hm

    hm.setBucketSize(args.bucket_size * 1024 * 1024)
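    # args.bucket_size appears to be given in MiB; converted to bytes for setBucketSize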

dist.init()
size = dist.size()
rank = dist.rank()
local_size = dist.local_size()
local_rank = dist.local_rank()

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], 'GPU')

if args.fp32:
    DTYPE, DTSIZE = tf.dtypes.float32, 4
else:
    DTYPE, DTSIZE = tf.dtypes.float16, 2


@tf.function