def _eval(self):
    logdir = args.logdir
    if cfg.TRAINER == 'replicated':
        all_results = multithread_eval_coco(self.dataflows, self.predictors)
    else:
        filenames = [os.path.join(
            logdir, 'outputs{}-part{}.json'.format(self.global_step, rank))
            for rank in range(hvd.local_size())]
        if self._horovod_run_eval:
            local_results = eval_coco(self.dataflow, self.predictor)
            fname = filenames[hvd.local_rank()]
            with open(fname, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for fname in filenames:
            with open(fname, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(fname)

    output_file = os.path.join(
        logdir, 'outputs{}.json'.format(self.global_step))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    try:
        scores = print_coco_metrics(output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
    except Exception:
        logger.exception("Exception in COCO evaluation.")
def _eval(self):
    logdir = self._output_dir
    if cfg.TRAINER == 'replicated':
        all_results = multithread_predict_dataflow(self.dataflows, self.predictors)
    else:
        filenames = [
            os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, rank))
            for rank in range(hvd.local_size())
        ]
        if self._horovod_run_eval:
            local_results = predict_dataflow(self.dataflow, self.predictor)
            fname = filenames[hvd.local_rank()]
            with open(fname, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for fname in filenames:
            with open(fname, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(fname)

    scores = DatasetRegistry.get(
        self._eval_dataset).eval_inference_results(all_results)
    for k, v in scores.items():
        self.trainer.monitors.put_scalar(self._eval_dataset + '-' + k, v)
def __init__(self, config):
    """
    :param Config config:
    """
    # noinspection PyUnresolvedReferences,PyPackageRequirements
    import horovod.tensorflow as hvd
    hvd.init()
    print(
        "Horovod initialized. Hostname %s, pid %i, rank %i / size %i, "
        "local rank %i / local size %i." % (
            socket.gethostname(), os.getpid(),
            hvd.rank(), hvd.size(), hvd.local_rank(), hvd.local_size()))
    self._config = config
    self._hvd_mod = hvd
    self._local_rank = hvd.local_rank()
    self._local_size = hvd.local_size()
    self._rank = hvd.rank()
    self._size = hvd.size()
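# Hedged companion sketch (not part of the snippet above): the usual next step
# after hvd.init() is to pin each process to one GPU by local rank, as some of
# the later snippets in this collection do. Assumes TF 2.x.
import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    # one process per GPU: local_rank() indexes the GPUs on this node
    tf.config.set_visible_devices(gpus[hvd.local_rank() % len(gpus)], 'GPU')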
def _setup_graph(self):
    num_gpu = cfg.TRAIN.NUM_GPUS
    if cfg.TRAINER == 'replicated':
        # TF bug in versions 1.11, 1.12: https://github.com/tensorflow/tensorflow/issues/22750
        buggy_tf = get_tf_version_tuple() in [(1, 11), (1, 12)]
        # Use two predictor threads per GPU to get better throughput
        self.num_predictor = num_gpu if buggy_tf else num_gpu * 2
        self.predictors = [
            self._build_predictor(k % num_gpu)
            for k in range(self.num_predictor)
        ]
        self.dataflows = [
            get_eval_dataflow(self._eval_dataset,
                              shard=k, num_shards=self.num_predictor)
            for k in range(self.num_predictor)
        ]
    else:
        # Only eval on the first machine,
        # because evaluation assumes that all horovod workers share the filesystem.
        # Alternatively, could eval on all ranks and use allgather, but allgather sometimes hangs.
        self._horovod_run_eval = hvd.rank() == hvd.local_rank()
        if self._horovod_run_eval:
            self.predictor = self._build_predictor(0)
            self.dataflow = get_eval_dataflow(self._eval_dataset,
                                              shard=hvd.local_rank(),
                                              num_shards=hvd.local_size())
        self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
def set_up_graph(self, trainer: tp.Trainer) -> None:
    self.trainer = trainer
    if self.trainer_type == "replicated":
        # Use multiple predictor threads per GPU to get better throughput.
        self.num_predictor = self.num_gpus * 2
        self.predictors = [
            self._build_predictor(k % self.num_gpus)
            for k in range(self.num_predictor)
        ]
        self.dataflows = [
            get_eval_dataflow(  # type: ignore
                self._eval_dataset,
                self.is_aws,
                self.is_gcs,
                shard=k,
                num_shards=self.num_predictor,
            )
            for k in range(self.num_predictor)
        ]
    else:
        if self.machine_rank == 0:
            # Run validation on one machine.
            self.predictor = self._build_predictor(0)
            self.dataflow = get_eval_dataflow(
                self._eval_dataset,
                self.is_aws,
                self.is_gcs,
                shard=hvd.local_rank(),
                num_shards=hvd.local_size(),
            )
        # All workers must take part in this barrier, even if they
        # are not performing validation.
        self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
@classmethod
def local_size(cls, *args):
    """Get the number of workers at the current node."""
    try:
        return mgw.local_size(*args)
    except NameError:
        raise NameError('module <mgw> not imported')
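# Illustrative (hypothetical) use of the wrapper above: shard a file list so
# that each process on a node reads a distinct subset. The `Distributor` class
# name and the matching `local_rank` wrapper are assumptions, not from the source.
import glob

files = sorted(glob.glob('data/*.tfrecord'))
shard = files[Distributor.local_rank()::Distributor.local_size()]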
def print_header():
    import horovod.tensorflow as hvd
    if hvd.rank() == 0:
        text = """
           _           _ _                     _
          (_)         | | |                   | |
 ___  ___  _  _ __ ___| | | |__   ___ _ __   ___| |__
/ __|/ __| | '_ ` _ \| | | '_ \ / _ \ '_ \ / __| '_ \\
\__ \ (__| | | | | | | | | |_) |  __/ | | | (__| | | |
|___/\___|_|_| |_| |_|_| |_.__/ \___|_| |_|\___|_| |_|
        """
        sys.stdout.write(text)
        sys.stdout.write("\n\n")
        LOGGER.info('Version: %s', sciml_bench.__version__)

    from mpi4py import MPI
    data = (MPI.Get_processor_name(), hvd.local_size())
    _comm = MPI.COMM_WORLD
    data = _comm.bcast(data, root=0)
    data = [data] if not isinstance(data, list) else data
    plurality = 'es' if len(data) > 1 else ''
    for node_name, local_size in data:
        LOGGER.info('%s has %s process%s', node_name, local_size, plurality)
def _setup_graph(self):
    num_gpu = cfg.TRAIN.NUM_GPUS
    if cfg.TRAINER == 'replicated':
        # Use two predictor threads per GPU to get better throughput
        self.num_predictor = num_gpu * 2
        self.predictors = [
            self._build_coco_predictor(k % num_gpu)
            for k in range(self.num_predictor)
        ]
        self.dataflows = [
            get_eval_dataflow(shard=k, num_shards=self.num_predictor)
            for k in range(self.num_predictor)
        ]
    else:
        if hvd.size() > hvd.local_size():
            logger.warn(
                "Distributed evaluation with horovod is unstable. "
                "Sometimes MPI hangs for unknown reasons.")
        self.predictor = self._build_coco_predictor(0)
        self.dataflow = get_eval_dataflow(shard=hvd.rank(),
                                          num_shards=hvd.size())
        # use uint8 to aggregate strings
        self.local_result_tensor = tf.placeholder(
            tf.uint8, shape=[None], name='local_result_string')
        self.concat_results = hvd.allgather(self.local_result_tensor,
                                            name='concat_results')
        local_size = tf.expand_dims(tf.size(self.local_result_tensor), 0)
        self.string_lens = hvd.allgather(local_size, name='concat_sizes')
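# A minimal sketch (not from the source) of how the uint8 tensors built in
# _setup_graph above could be consumed at eval time: serialize the local
# results to JSON bytes, allgather them, then split the concatenated buffer
# back per rank using the gathered lengths. Assumes `sess`, `json`, and
# numpy as `np` are available.
local_bytes = np.frombuffer(json.dumps(local_results).encode('utf-8'),
                            dtype=np.uint8)
concat, lens = sess.run([self.concat_results, self.string_lens],
                        feed_dict={self.local_result_tensor: local_bytes})
all_results, offset = [], 0
for ln in lens.flatten():
    # decode this rank's slice of the concatenated byte buffer
    chunk = concat[offset:offset + ln].tobytes().decode('utf-8')
    all_results.extend(json.loads(chunk))
    offset += ln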
def dataset_options(self):
    options = tf.data.Options()
    options.experimental_deterministic = not self._is_training
    options.experimental_optimization.map_parallelization = \
        self._enable_map_parallelization
    options.experimental_optimization.parallel_batch = True
    options.threading.private_threadpool_size = max(
        2, (multiprocessing.cpu_count() // hvd.local_size()) - 2)
    return options
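# Hedged usage note: a tf.data.Options object only takes effect once attached
# to a dataset via the standard with_options() call; the `dataset` name here
# is assumed.
dataset = dataset.with_options(self.dataset_options())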
def read_tf_records(batch_size, tf_records, num_repeats=1,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None, interleave=True,
                    filter_amount=1.0):
    """
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: 1)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling
        interleave: whether to interleave examples from multiple tf_records
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    """
    if shuffle_examples and not shuffle_buffer_size:
        raise ValueError("Must set shuffle buffer size if shuffling examples")

    # tf_records = list(tf_records)
    # disable shuffle for sharding
    # if shuffle_records:
    #     random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)
    logging.info('hvd rank {}, local rank {}, size {}, shard_size {}'.format(
        hvd.rank(), hvd.local_rank(), hvd.size(), hvd.local_size()))
    record_list = record_list.shard(hvd.local_size(), hvd.local_rank())

    # compression_type here must agree with write_tf_examples
    map_func = functools.partial(
        tf.data.TFRecordDataset,
        buffer_size=8 * 1024 * 1024,
        compression_type='ZLIB')

    if interleave:
        # cycle_length = how many tfrecord files are read in parallel
        # The idea is to shuffle both the order of the files being read,
        # and the examples being read from the files.
        dataset = record_list.apply(tf.data.experimental.parallel_interleave(
            map_func, cycle_length=64, sloppy=True))
    else:
        dataset = record_list.flat_map(map_func)

    if filter_amount < 1.0:
        dataset = dataset.filter(
            lambda _: tf.random_uniform([]) < filter_amount)

    dataset = dataset.repeat(num_repeats)
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    return dataset
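# Hypothetical usage of read_tf_records above; the file names and sizes are
# illustrative only, not from the source.
dataset = read_tf_records(
    batch_size=256,
    tf_records=['train-000.tfrecord.zz', 'train-001.tfrecord.zz'],
    shuffle_buffer_size=10000,
    filter_amount=0.5)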
def _eval(self):
    logdir = args.logdir
    if cfg.TRAINER == 'replicated':
        # with ThreadPoolExecutor(max_workers=self.num_predictor, thread_name_prefix='EvalWorker') as executor, \
        with ThreadPoolExecutor(max_workers=self.num_predictor) as executor, \
                tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
            futures = []
            for dataflow, pred in zip(self.dataflows, self.predictors):
                futures.append(
                    executor.submit(eval_coco, dataflow, pred, pbar))
            all_results = list(
                itertools.chain(*[fut.result() for fut in futures[:1]]))
            all_results_test = list(
                itertools.chain(*[fut.result() for fut in futures[1:2]]))
    else:
        if self._horovod_run_eval:
            local_results = eval_coco(self.dataflow, self.predictor)
            output_partial = os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step,
                                                       hvd.local_rank()))
            with open(output_partial, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for k in range(hvd.local_size()):
            output_partial = os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
            with open(output_partial, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(output_partial)

    output_file = os.path.join(logdir,
                               'outputs{}.json'.format(self.global_step))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    try:
        scores = print_evaluation_scores(output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
    except Exception:
        logger.exception("Exception in COCO evaluation.")

    output_file_test = os.path.join(
        logdir, 'outputs-test{}.json'.format(self.global_step))
    with open(output_file_test, 'w') as f:
        json.dump(all_results_test, f)
def init_workers(distributed=False):
    if distributed and not no_horovod:
        hvd.init()
        assert hvd.mpi_threads_supported()
        from mpi4py import MPI
        assert hvd.size() == MPI.COMM_WORLD.Get_size()
        comm = MPI.COMM_WORLD
        print("Rank: {}, Size: {}".format(hvd.rank(), hvd.size()))
        return SimpleNamespace(rank=hvd.rank(), size=hvd.size(),
                               local_rank=hvd.local_rank(),
                               local_size=hvd.local_size(), comm=comm)
    else:
        print("not doing distributed")
        return SimpleNamespace(rank=0, size=1,
                               local_rank=0, local_size=1, comm=None)
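# Illustrative (hypothetical) caller for init_workers: shard work by global
# rank and vary the seed per worker. `all_files` is assumed to exist.
import numpy as np

dist = init_workers(distributed=True)
my_files = all_files[dist.rank::dist.size]
np.random.seed(1234 + dist.rank)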
def test_horovod_adasum_multiple_allreduce_gpu_nccl(self):
    """Test on GPU using NCCL that Adasum correctly computes 2D tensors."""
    hvd.init()
    # TODO support non-MPI Adasum operation
    if not hvd.mpi_enabled() or not hvd.gpu_available('tensorflow') \
            or not hvd.nccl_built():
        self.skipTest("MPI, GPU or NCCL not available")

    rank = hvd.rank()
    rank_tensors = []
    size = hvd.size()
    # TODO support testing with non-power-of-2 ranks
    if not is_power2(size):
        self.skipTest("MPI rank is not power of 2")

    local_size = hvd.local_size()
    # Only run on homogeneous cluster
    if not hvd.is_homogeneous():
        self.skipTest("Horovod cluster is not homogeneous")

    num_nodes = int(size / local_size)
    for _ in range(size):
        rank_tensors.append([
            np.random.random_sample((2, 2)),
            np.random.random_sample((2, 2))
        ])
    sum_local_ranks_tensor = []
    for i in range(num_nodes):
        sum_local_ranks_tensor.append([np.zeros((2, 2)), np.zeros((2, 2))])
        for j in range(local_size):
            sum_local_ranks_tensor[i] = np.add(sum_local_ranks_tensor[i],
                                               rank_tensors[j])

    answer = reference_tree_reduction(sum_local_ranks_tensor, num_nodes)
    answer = np.true_divide(answer, local_size)
    for dtype in [tf.float16, tf.float32, tf.float64]:
        with tf.device("/gpu:{}".format(hvd.local_rank())):
            tensors = map(tf.constant, rank_tensors[rank])
            # cast to the corresponding dtype
            tensors = map(lambda tensor: tf.cast(tensor, dtype), tensors)
            # and away we go: do reduction
            reduced_tensors = [
                self.evaluate(hvd.allreduce(tensor, op=hvd.Adasum))
                for tensor in tensors
            ]
            # cast expected result to the type of the tensorflow values
            np_type = dtype.as_numpy_dtype
            tmp = [t.astype(np_type) for t in answer]
            self.assertAllCloseAccordingToType(tmp, reduced_tensors)
def _setup_graph(self):
    num_gpu = cfg.TRAIN.NUM_GPUS
    if cfg.TRAINER == 'replicated':
        # Use two predictor threads per GPU to get better throughput
        self.num_predictor = num_gpu * 2
        self.predictors = [self._build_coco_predictor(k % num_gpu)
                           for k in range(self.num_predictor)]
        self.dataflows = [get_eval_dataflow(shard=k,
                                            num_shards=self.num_predictor)
                          for k in range(self.num_predictor)]
    else:
        # Only eval on the first machine.
        # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs
        self._horovod_run_eval = hvd.rank() == hvd.local_rank()
        if self._horovod_run_eval:
            self.predictor = self._build_coco_predictor(0)
            self.dataflow = get_eval_dataflow(shard=hvd.local_rank(),
                                              num_shards=hvd.local_size())
        self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
def compute_validation_metrics(self) -> Any:
    if self.trainer_type == "replicated":
        all_results = multithread_predict_dataflow(
            self.dataflows, self.predictors
        )  # type: ignore
    else:
        filenames = [
            os.path.join(
                self._output_dir,
                "outputs{}-part{}.json".format(self.trainer.global_step, rank)
            )
            for rank in range(hvd.local_size())
        ]
        if self.machine_rank == 0:
            local_results = predict_dataflow(self.dataflow, self.predictor)
            fname = filenames[hvd.local_rank()]
            with open(fname, "w") as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for fname in filenames:
            with open(fname, "r") as f:
                obj = json.load(f)
            all_results.extend(obj)

    output_file = os.path.join(
        self._output_dir,
        "{}-outputs{}-{}.json".format(
            self._eval_dataset, self.trainer.global_step, time.time()
        ),
    )
    metrics = DatasetRegistry.get(self._eval_dataset).eval_inference_results(  # type: ignore
        all_results, output_file
    )
    # If there are no detections, the metrics result is totally empty, instead
    # of containing zeroes. Ensure that the main evaluation metric has some value.
    metrics.setdefault("mAP(bbox)/IoU=0.5:0.95", 0)
    return metrics
def _eval(self):
    logdir = args.logdir
    if cfg.TRAINER == 'replicated':
        with ThreadPoolExecutor(max_workers=self.num_predictor,
                                thread_name_prefix='EvalWorker') as executor, \
                tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
            futures = []
            for dataflow, pred in zip(self.dataflows, self.predictors):
                futures.append(executor.submit(eval_coco, dataflow, pred, pbar))
            all_results = list(itertools.chain(*[fut.result() for fut in futures]))
    else:
        if self._horovod_run_eval:
            local_results = eval_coco(self.dataflow, self.predictor)
            output_partial = os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step,
                                                       hvd.local_rank()))
            with open(output_partial, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for k in range(hvd.local_size()):
            output_partial = os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
            with open(output_partial, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(output_partial)

    output_file = os.path.join(
        logdir, 'outputs{}.json'.format(self.global_step))
    with open(output_file, 'w') as f:
        json.dump(all_results, f)
    try:
        scores = print_evaluation_scores(output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
    except Exception:
        logger.exception("Exception in COCO evaluation.")
def main(input_path_train, input_path_validation, downsampling_fact,
         downsampling_mode, channels, data_format, label_id, blocks, weights,
         image_dir, checkpoint_dir, trn_sz, val_sz, loss_type, fs_type,
         optimizer, batch, batchnorm, num_epochs, dtype, chkpt, filter_sz,
         growth, disable_checkpoints, disable_imsave, tracing, trace_dir,
         output_sampling, scale_factor):
    # init horovod
    nvtx.RangePush("init horovod", 1)
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        # not all horovod versions have that implemented
        try:
            comm_local_size = hvd.local_size()
        except Exception:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".format(comm_size))
    nvtx.RangePop()  # init horovod

    # downsampling? recompute image dims
    image_height = image_height_orig // downsampling_fact
    image_width = image_width_orig // downsampling_fact

    # parameters
    per_rank_output = False
    loss_print_interval = 10

    # session config
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=6,  # 1
        intra_op_parallelism_threads=1,  # 6
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    sess_config.gpu_options.force_gpu_compatible = True

    # get data
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    trn_data = load_data(input_path_train, True, trn_sz, horovod)
    val_data = load_data(input_path_validation, False, val_sz, horovod)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("done.")

    # print some stats
    if comm_rank == 0:
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Batch normalization: {}".format(batchnorm))
        print("Blocks: {}".format(blocks))
        print("Growth rate: {}".format(growth))
        print("Filter size: {}".format(filter_sz))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Loss scale factor: {}".format(scale_factor))
        print("Output sampling target: {}".format(output_sampling))
        # print optimizer parameters
        for k, v in optimizer.items():
            print("Solver Parameters: {k}: {v}".format(k=k, v=v))
        # print("Optimizer type: {}".format(optimizer['opt_type']))
        print("Num training samples: {}".format(trn_data.shape[0]))
        print("Num validation samples: {}".format(val_data.shape[0]))
        print("Disable checkpoints: {}".format(disable_checkpoints))
        print("Disable image save: {}".format(disable_imsave))
        print("Downsampling factor: {}".format(downsampling_fact))
        print("Downsampling mode: {}".format(downsampling_mode))

    # compute epochs and stuff:
    if fs_type == "local":
        num_samples = trn_data.shape[0] // comm_local_size
    else:
        num_samples = trn_data.shape[0] // comm_size
    num_steps_per_epoch = num_samples // batch
    num_steps = num_epochs * num_steps_per_epoch
    if per_rank_output:
        print("Rank {} does {} steps per epoch".format(comm_rank, num_steps_per_epoch))

    with training_graph.as_default():
        nvtx.RangePush("TF Init", 3)
        # create readers
        trn_reader = h5_input_reader(input_path_train, channels, weights,
                                     dtype, normalization_file="stats.h5",
                                     update_on_read=False,
                                     data_format=data_format,
                                     label_id=label_id,
                                     sample_target=output_sampling)
        val_reader = h5_input_reader(input_path_validation, channels, weights,
                                     dtype, normalization_file="stats.h5",
                                     update_on_read=False,
                                     data_format=data_format,
                                     label_id=label_id)

        # create datasets
        if fs_type == "local":
            trn_dataset = create_dataset(trn_reader, trn_data, batch,
                                         num_epochs, comm_local_size,
                                         comm_local_rank, dtype, shuffle=True)
            val_dataset = create_dataset(val_reader, val_data, batch, 1,
                                         comm_local_size, comm_local_rank,
                                         dtype, shuffle=False)
        else:
            trn_dataset = create_dataset(trn_reader, trn_data, batch,
                                         num_epochs, comm_size, comm_rank,
                                         dtype, shuffle=True)
            val_dataset = create_dataset(val_reader, val_data, batch, 1,
                                         comm_size, comm_rank, dtype,
                                         shuffle=False)

        # create iterators
        handle = tf.placeholder(tf.string, shape=[], name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype, tf.string),
            ((batch, len(channels), image_height_orig, image_width_orig)
             if data_format == "channels_first"
             else (batch, image_height_orig, image_width_orig, len(channels)),
             (batch, image_height_orig, image_width_orig),
             (batch, image_height_orig, image_width_orig),
             (batch)))
        next_elem = iterator.get_next()

        # if downsampling, do some preprocessing
        if downsampling_fact != 1:
            if downsampling_mode == "scale":
                # do downsampling
                rand_select = tf.cast(tf.one_hot(tf.random_uniform(
                    (batch, image_height, image_width), minval=0,
                    maxval=downsampling_fact * downsampling_fact,
                    dtype=tf.int32),
                    depth=downsampling_fact * downsampling_fact, axis=-1),
                    dtype=tf.int32)
                next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact, downsampling_fact, 'valid', data_format),
                             tf.reduce_max(tf.multiply(tf.image.extract_image_patches(tf.expand_dims(next_elem[1], axis=-1),
                                                                                      [1, downsampling_fact, downsampling_fact, 1],
                                                                                      [1, downsampling_fact, downsampling_fact, 1],
                                                                                      [1, 1, 1, 1], 'VALID'), rand_select), axis=-1),
                             tf.squeeze(tf.layers.average_pooling2d(tf.expand_dims(next_elem[2], axis=-1), downsampling_fact, downsampling_fact, 'valid', "channels_last"), axis=-1),
                             next_elem[3])
            elif downsampling_mode == "center-crop":
                # some parameters
                length = 1. / float(downsampling_fact)
                offset = length / 2.
                boxes = [[offset, offset, offset + length, offset + length]] * batch
                box_ind = list(range(0, batch))
                crop_size = [image_height, image_width]

                # be careful with data order
                if data_format == "channels_first":
                    next_elem = (tf.transpose(next_elem[0], perm=[0, 2, 3, 1]),
                                 next_elem[1], next_elem[2], next_elem[3])

                # crop
                next_elem = (tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="data_cropping"),
                             ensure_type(tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[1], axis=-1), boxes, box_ind, crop_size, method='nearest', extrapolation_value=0, name="label_cropping"), axis=-1), tf.int32),
                             tf.squeeze(tf.image.crop_and_resize(tf.expand_dims(next_elem[2], axis=-1), boxes, box_ind, crop_size, method='bilinear', extrapolation_value=0, name="weight_cropping"), axis=-1),
                             next_elem[3])

                # be careful with data order
                if data_format == "channels_first":
                    next_elem = (tf.transpose(next_elem[0], perm=[0, 3, 1, 2]),
                                 next_elem[1], next_elem[2], next_elem[3])
            elif downsampling_mode == "random-crop":
                # some parameters
                crop_size = [batch, image_height, image_width, len(channels) + 2]

                # concatenate input, crop, split apart
                crop_input = tf.concat([next_elem[0] if data_format == "channels_last"
                                        else tf.transpose(next_elem[0], perm=[0, 2, 3, 1]),
                                        ensure_type(tf.expand_dims(next_elem[1], axis=-1), tf.float32),
                                        tf.expand_dims(next_elem[2], axis=-1)],
                                       axis=-1)
                crop_output = tf.image.random_crop(crop_input, crop_size)

                # restore iterator output
                crop_image = crop_output[:, :, :, :len(channels)]
                crop_label = ensure_type(crop_output[:, :, :, len(channels)], tf.int32)
                crop_weight = crop_output[:, :, :, len(channels) + 1]
                next_elem = (crop_image if data_format == "channels_last"
                             else tf.transpose(crop_image, perm=[0, 3, 1, 2]),
                             crop_label, crop_weight, next_elem[3])
            else:
                raise ValueError(
                    "Error, downsampling mode {} not supported. "
                    "Supported are [center-crop, random-crop, scale]".format(downsampling_mode))

        # create init handles
        # trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        # val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        # compute the input filter number based on number of channels used
        num_channels = len(channels)
        nb_filter = 64

        # set up model
        logit, prediction = create_tiramisu(3, next_elem[0], image_height,
                                            image_width, num_channels,
                                            loss_weights=weights,
                                            nb_layers_per_block=blocks,
                                            p=0.2, wd=1e-4, dtype=dtype,
                                            batchnorm=batchnorm,
                                            growth_rate=growth,
                                            nb_filter=nb_filter,
                                            filter_sz=filter_sz,
                                            median_filter=False,
                                            data_format=data_format)
        # prediction_argmax = median_pool(prediction_argmax, 3, strides=[1,1,1,1])

        # set up loss
        loss = None
        if loss_type == "weighted":
            # cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1], logits=logit, weights=w_cast,
                reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
            if scale_factor != 1.0:
                loss *= scale_factor
        elif loss_type == "focal":
            labels_one_hot = tf.contrib.layers.one_hot_encoding(next_elem[1], 3)
            labels_one_hot = ensure_type(labels_one_hot, dtype)
            loss = focal_loss(onehot_labels=labels_one_hot, logits=logit,
                              alpha=1., gamma=2.)
        else:
            raise ValueError("Error, loss type {} not supported.".format(loss_type))

        # determine flops
        flops = graph_flops.graph_flops(
            format="NHWC" if data_format == "channels_last" else "NCHW",
            batch=batch, sess_config=sess_config)
        flops *= comm_size
        if comm_rank == 0:
            print('training flops: {:.3f} TF/step'.format(flops * 1e-12))

        # number of trainable parameters
        if comm_rank == 0:
            num_params = get_number_of_trainable_parameters()
            print('number of trainable parameters: {} ({} MB)'.format(
                num_params,
                num_params * (4 if dtype == tf.float32 else 2) * (2**-20)))

        if horovod:
            loss_avg = hvd.allreduce(ensure_type(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)

        # set up global step - keep on CPU
        with tf.device('/device:CPU:0'):
            global_step = tf.train.get_or_create_global_step()

        # set up optimizer
        if optimizer['opt_type'].startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op, lr = get_larc_optimizer(optimizer, loss, global_step,
                                              num_steps_per_epoch, horovod)
        else:
            train_op, lr = get_optimizer(optimizer, loss, global_step,
                                         num_steps_per_epoch, horovod)

        # set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(
            labels=next_elem[1],
            predictions=tf.argmax(prediction, axis=3),
            num_classes=3, weights=None,
            metrics_collections=None, updates_collections=None,
            name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables() if i.name.startswith('iou_score/')
        ])
        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        # hooks
        # these hooks are essential. regularize the step hook by adding one additional step at the end
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        # bcast init for bcasting the model after start
        if horovod:
            init_bcast = hvd.broadcast_global_variables(0)
        # initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        # checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = 5 * num_steps_per_epoch
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            if not disable_checkpoints:
                hooks.append(
                    tf.train.CheckpointSaverHook(
                        checkpoint_dir=checkpoint_dir,
                        save_steps=checkpoint_save_freq,
                        saver=checkpoint_saver))
            # create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        if tracing is not None:
            import tracehook
            tracing_hook = tracehook.TraceHook(steps_to_trace=tracing,
                                               cache_traces=True,
                                               trace_dir=trace_dir)
            hooks.append(tracing_hook)

        # instead of averaging losses over an entire epoch, use a moving
        # window average
        recent_losses = []
        loss_window_size = 10

        # start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            # initialize
            sess.run([init_op, init_local_op])
            # restore from checkpoint:
            if comm_rank == 0 and not disable_checkpoints:
                load_model(sess, checkpoint_saver, checkpoint_dir)
            # broadcast loaded model variables
            if horovod:
                sess.run(init_bcast)
            # create iterator handles
            trn_handle, val_handle = sess.run([trn_handle_string,
                                               val_handle_string])
            # init iterators
            sess.run(trn_init_op, feed_dict={handle: trn_handle})
            sess.run(val_init_op, feed_dict={handle: val_handle})

            nvtx.RangePop()  # TF Init

            # figure out what step we're on (it won't be 0 if we are
            # restoring from a checkpoint) so we can count from there
            train_steps = sess.run([global_step])[0]

            # do the training
            epoch = 1
            step = 1

            t_sustained_start = time.time()
            nvtx.RangePush("Training Loop", 4)
            nvtx.RangePush("Epoch", epoch)
            start_time = time.time()
            while not sess.should_stop():
                # training loop
                try:
                    nvtx.RangePush("Step", step)
                    # construct feed dict
                    t_inst_start = time.time()
                    _, tmp_loss = sess.run(
                        [train_op, (loss if per_rank_output else loss_avg)],
                        feed_dict={handle: trn_handle})
                    t_inst_end = time.time()
                    train_steps += 1
                    train_steps_in_epoch = train_steps % num_steps_per_epoch
                    recent_losses = [tmp_loss] + recent_losses[0:loss_window_size - 1]
                    train_loss = sum(recent_losses) / len(recent_losses)
                    nvtx.RangePop()  # Step
                    step += 1

                    # print step report
                    eff_steps = train_steps_in_epoch if (train_steps_in_epoch > 0) else num_steps_per_epoch
                    if (train_steps % loss_print_interval) == 0:
                        if per_rank_output:
                            print("REPORT: rank {}, training loss for step {} (of {}) is {}, time {:.3f}".format(
                                comm_rank, train_steps, num_steps, train_loss,
                                time.time() - start_time))
                        else:
                            if comm_rank == 0:
                                print("REPORT: training loss for step {} (of {}) is {}, time {:.3f}, r_inst {:.3f}".format(
                                    train_steps, num_steps, train_loss,
                                    time.time() - start_time,
                                    1e-12 * flops / (t_inst_end - t_inst_start)))

                    # do the validation phase
                    if train_steps_in_epoch == 0:
                        end_time = time.time()
                        # print epoch report
                        if per_rank_output:
                            print("COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}".format(
                                comm_rank, epoch, num_epochs, train_loss,
                                time.time() - start_time,
                                1e-12 * flops * num_steps_per_epoch / (end_time - t_sustained_start)))
                        else:
                            if comm_rank == 0:
                                print("COMPLETED: training loss for epoch {} (of {}) is {}, time {:.3f}, r_sust {:.3f}".format(
                                    epoch, num_epochs, train_loss,
                                    time.time() - start_time,
                                    1e-12 * flops * num_steps_per_epoch / (end_time - t_sustained_start)))

                        # evaluation loop
                        eval_loss = 0.
                        eval_steps = 0
                        nvtx.RangePush("Eval Loop", 7)
                        while True:
                            try:
                                # construct feed dict
                                _, tmp_loss, val_model_predictions, val_model_labels, val_model_filenames = sess.run(
                                    [iou_update_op,
                                     (loss if per_rank_output else loss_avg),
                                     prediction, next_elem[1], next_elem[3]],
                                    feed_dict={handle: val_handle})

                                # print some images
                                if comm_rank == 0 and not disable_imsave:
                                    if have_imsave:
                                        imsave(image_dir + '/test_pred_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png',
                                               np.argmax(val_model_predictions[0, ...], axis=2) * 100)
                                        imsave(image_dir + '/test_label_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png',
                                               val_model_labels[0, ...] * 100)
                                        imsave(image_dir + '/test_combined_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png',
                                               colormap[val_model_labels[0, ...], np.argmax(val_model_predictions[0, ...], axis=2)])
                                    else:
                                        np.savez(image_dir + '/test_epoch' + str(epoch) + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.npz',
                                                 prediction=np.argmax(val_model_predictions[0, ...], axis=2) * 100,
                                                 label=val_model_labels[0, ...] * 100,
                                                 filename=val_model_filenames[0])

                                eval_loss += tmp_loss
                                eval_steps += 1
                            except tf.errors.OutOfRangeError:
                                eval_steps = np.max([eval_steps, 1])
                                eval_loss /= eval_steps
                                if per_rank_output:
                                    print("COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}".format(
                                        comm_rank, epoch, num_epochs, eval_loss))
                                else:
                                    if comm_rank == 0:
                                        print("COMPLETED: evaluation loss for epoch {} (of {}) is {}".format(
                                            epoch, num_epochs, eval_loss))
                                if per_rank_output:
                                    iou_score = sess.run(iou_op)
                                    print("COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}".format(
                                        comm_rank, epoch, num_epochs, iou_score))
                                else:
                                    iou_score = sess.run(iou_avg)
                                    if comm_rank == 0:
                                        print("COMPLETED: evaluation IoU for epoch {} (of {}) is {}".format(
                                            epoch, num_epochs, iou_score))
                                sess.run(iou_reset_op)
                                sess.run(val_init_op, feed_dict={handle: val_handle})
                                break
                        nvtx.RangePop()  # Eval Loop

                        # reset counters
                        epoch += 1
                        step = 0
                        t_sustained_start = time.time()
                        nvtx.RangePop()  # Epoch
                        nvtx.RangePush("Epoch", epoch)

                except tf.errors.OutOfRangeError:
                    break

            nvtx.RangePop()  # Epoch
            nvtx.RangePop()  # Training Loop

    # write any cached traces to disk
    if tracing is not None:
        tracing_hook.write_traces()
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.horovod:
        hvd.init()
    if FLAGS.use_fp16:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    validate_flags_or_throw(bert_config)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
    hvd_rank = 0
    hvd_local_rank = 0

    config = tf.ConfigProto()
    learning_rate = FLAGS.learning_rate
    if FLAGS.horovod:
        tf.logging.info("Multi-GPU training with TF Horovod")
        tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank())
        global_batch_size = FLAGS.train_batch_size * hvd.size() * FLAGS.num_accumulation_steps
        learning_rate = learning_rate * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        hvd_local_rank = hvd.local_rank()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.logging.info("***** Configuration *****")
        for key in FLAGS.__flags.keys():
            tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
        tf.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    training_hooks.append(
        LogTrainRunHook(global_batch_size, hvd_rank,
                        FLAGS.save_checkpoints_steps))

    # Prepare Training Data
    if FLAGS.do_train:
        train_examples = read_squad_examples(
            input_file=FLAGS.train_file, is_training=True,
            version_2_with_negative=FLAGS.version_2_with_negative)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in the `input_fn`.
        rng = random.Random(12345)
        rng.shuffle(train_examples)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.local_size())
            ]
            num_examples_per_local_rank = len(train_examples) // hvd.local_size()
            remainder = len(train_examples) % hvd.local_size()
            if hvd.local_rank() < remainder:
                start_index = hvd.local_rank() * (num_examples_per_local_rank + 1)
                end_index = start_index + num_examples_per_local_rank + 1
            else:
                start_index = hvd.local_rank() * num_examples_per_local_rank + remainder
                end_index = start_index + num_examples_per_local_rank

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                hvd=None if not FLAGS.horovod else hvd,
                                use_fp16=FLAGS.use_fp16)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        # We write to a temporary file to avoid storing very large constant tensors
        # in memory.
        train_writer = FeatureWriter(filename=tmp_filenames[hvd_local_rank],
                                     is_training=True)
        convert_examples_to_features(
            examples=train_examples[start_index:end_index],
            tokenizer=tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=True,
            output_fn=train_writer.process_feature,
            verbose_logging=FLAGS.verbose_logging)
        train_writer.close()

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num orig examples = %d", end_index - start_index)
        tf.logging.info("  Num split examples = %d", train_writer.num_features)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        tf.logging.info("  LR = %f", learning_rate)
        del train_examples
        if FLAGS.horovod:
            barrier = hvd.allreduce(tf.constant(0))
            with tf.Session(config=config) as sess:
                sess.run(barrier)

        train_input_fn = input_fn_builder(
            input_file=tmp_filenames,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn, hooks=training_hooks,
                        max_steps=num_train_steps)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.logging.info("-----------------------------")
            tf.logging.info("Total Training Time = %0.2f for Sentences = %d",
                            train_time_elapsed,
                            num_train_steps * global_batch_size)
            tf.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                            train_time_wo_overhead,
                            (num_train_steps - training_hooks[-1].skipped) * global_batch_size)
            tf.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f",
                            avg_sentences_per_second)
            tf.logging.info("Throughput Average (sentences/sec) = %0.2f",
                            ss_sentences_per_second)
            tf.logging.info("-----------------------------")

    if FLAGS.export_trtis and master_process:
        export_model(estimator, FLAGS.output_dir, FLAGS.init_checkpoint)

    if FLAGS.do_predict and master_process:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file, is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative)

        # Perform evaluation on subset, useful for profiling
        if FLAGS.num_eval_iterations is not None:
            eval_examples = eval_examples[:FLAGS.num_eval_iterations * FLAGS.predict_batch_size]

        eval_writer = FeatureWriter(
            filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
            is_training=False)
        eval_features = []

        def append_feature(feature):
            eval_features.append(feature)
            eval_writer.process_feature(feature)

        convert_examples_to_features(examples=eval_examples,
                                     tokenizer=tokenizer,
                                     max_seq_length=FLAGS.max_seq_length,
                                     doc_stride=FLAGS.doc_stride,
                                     max_query_length=FLAGS.max_query_length,
                                     is_training=False,
                                     output_fn=append_feature,
                                     verbose_logging=FLAGS.verbose_logging)
        eval_writer.close()

        tf.logging.info("***** Running predictions *****")
        tf.logging.info("  Num orig examples = %d", len(eval_examples))
        tf.logging.info("  Num split examples = %d", len(eval_features))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = input_fn_builder(
            input_file=eval_writer.filename,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        all_results = []
        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()
        for result in estimator.predict(predict_input_fn,
                                        yield_single_examples=True,
                                        hooks=eval_hooks):
            if len(all_results) % 1000 == 0:
                tf.logging.info("Processing example: %d" % (len(all_results)))
            unique_id = int(result["unique_ids"])
            start_logits = [float(x) for x in result["start_logits"].flat]
            end_logits = [float(x) for x in result["end_logits"].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        eval_time_elapsed = time.time() - eval_start_time
        eval_time_wo_overhead = eval_hooks[-1].total_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size
        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.logging.info("-----------------------------")
        tf.logging.info("Total Inference Time = %0.2f for Sentences = %d",
                        eval_time_elapsed,
                        eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
                        eval_time_wo_overhead,
                        (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size)
        tf.logging.info("Summary Inference Statistics")
        tf.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
        tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
        tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
        tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
        tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
        tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
        tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.logging.info("Throughput Average (sentences/sec) = %0.2f",
                        ss_sentences_per_second)
        tf.logging.info("-----------------------------")

        output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")

        write_predictions(eval_examples, eval_features, all_results,
                          FLAGS.n_best_size, FLAGS.max_answer_length,
                          FLAGS.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file)
    print('Unable to call \n'
          '`tf.compat.v1.enable_resource_variables()`. Continuing...')

try:
    import horovod
    import horovod.tensorflow as hvd
    try:
        RANK = hvd.rank()
    except ValueError:
        hvd.init()
        RANK = hvd.rank()

    SIZE = hvd.size()
    HAS_HOROVOD = True
    IS_CHIEF = (RANK == 0)
    LOCAL_SIZE = hvd.local_size()
    LOCAL_RANK = hvd.local_rank()
    # logging.info(f'using horovod from: {horovod.__file__}')
    # logging.info(f'using horovod version: {horovod.__version__}')
    prefix = f'{RANK} / {SIZE} ::'
    if IS_CHIEF:
        print(80 * '=')
        print(f'{prefix} Using tensorflow version: {tf.__version__}')
        print(f'{prefix} Using tensorflow from: {tf.__file__}')
        print(f'{prefix} Using horovod version: {horovod.__version__}')
        print(f'{prefix} Using horovod from: {horovod.__file__}')
        print(80 * '=')
    else:
        print(f"Hello, I'm rank: {RANK} of {SIZE} total ranks")

GPUS = tf.config.experimental.list_physical_devices('GPU')
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    lr_scaler = hvd.size()
    # By default, Adasum doesn't need scaling when increasing batch size.
    # If used with NCCL, scale lr by local_size.
    if args.use_adasum:
        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1

    # Horovod: adjust learning rate based on lr_scaler.
    opt = tf.train.AdamOptimizer(args.lr * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(
        opt, op=hvd.Adasum if args.use_adasum else hvd.Average)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=args.num_steps // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
def main():
    ''' simple starter program for tensorflow models. '''
    logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:%(name)s:%(message)s'
    logging_datefmt = '%Y-%m-%d %H:%M:%S'
    logging_level = logging.INFO

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-c', '--config', dest='config_filename',
                        help='configuration filename in json format [default: %s]' % DEFAULT_CONFIG,
                        default=DEFAULT_CONFIG)
    parser.add_argument('--interop', type=int,
                        help='set Tensorflow "inter_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTEROP,
                        default=DEFAULT_INTEROP)
    parser.add_argument('--intraop', type=int,
                        help='set Tensorflow "intra_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTRAOP,
                        default=DEFAULT_INTRAOP)
    parser.add_argument('-l', '--logdir', default=DEFAULT_LOGDIR,
                        help='define location to save log information [default: %s]' % DEFAULT_LOGDIR)
    parser.add_argument('--horovod', dest='horovod', default=False,
                        action='store_true', help="Use horovod")
    parser.add_argument('--debug', dest='debug', default=False,
                        action='store_true', help="Set Logger to DEBUG")
    parser.add_argument('--error', dest='error', default=False,
                        action='store_true', help="Set Logger to ERROR")
    parser.add_argument('--warning', dest='warning', default=False,
                        action='store_true', help="Set Logger to WARNING")
    parser.add_argument('--logfilename', dest='logfilename', default=None,
                        help='if set, logging information will go to file')
    args = parser.parse_args()

    hvd = None
    if args.horovod:
        import horovod
        import horovod.tensorflow as hvd
        hvd.init()
        logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:' + (
            '%05d' % hvd.rank()) + ':%(name)s:%(message)s'
        if hvd.rank() > 0:
            logging_level = logging.WARNING

    if args.debug and not args.error and not args.warning:
        logging_level = logging.DEBUG
        os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0'
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
    elif not args.debug and args.error and not args.warning:
        logging_level = logging.ERROR
    elif not args.debug and not args.error and args.warning:
        logging_level = logging.WARNING

    logging.basicConfig(level=logging_level,
                        format=logging_format,
                        datefmt=logging_datefmt,
                        filename=args.logfilename)

    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        logging.warning('CUDA_VISIBLE_DEVICES=%s %s',
                        os.environ['CUDA_VISIBLE_DEVICES'],
                        device_lib.list_local_devices())
    else:
        logging.info('CUDA_VISIBLE_DEVICES not defined in os.environ')
    logging.info('using tensorflow version: %s', tf.__version__)
    logging.info('using tensorflow from: %s', tf.__file__)
    if hvd:
        logging.warning('rank: %5d size: %5d local rank: %5d local size: %5d',
                        hvd.rank(), hvd.size(),
                        hvd.local_rank(), hvd.local_size())
        logging.info('using horovod version: %s', horovod.__version__)
        logging.info('using horovod from: %s', horovod.__file__)
    logging.info('logdir: %s', args.logdir)
    logging.info('interop: %s', args.interop)
    logging.info('intraop: %s', args.intraop)

    device_str = '/CPU:0'
    if tf.test.is_gpu_available():
        # device_str = '/device:GPU:' + str(hvd.local_rank())
        gpus = tf.config.experimental.list_logical_devices('GPU')
        logger.warning('gpus = %s', gpus)
        # assert hvd.local_rank() < len(gpus), f'localrank = {hvd.local_rank()} len(gpus) = {len(gpus)}'
        device_str = gpus[0].name
        # logger.info('device_str = %s', device_str)
    logger.warning('device: %s', device_str)

    config = json.load(open(args.config_filename))
    config['device'] = device_str
    config['hvd'] = hvd
    logger.info('-=-=-=-=-=-=-=-=- CONFIG FILE -=-=-=-=-=-=-=-=-')
    logger.info('%s = \n%s', args.config_filename,
                json.dumps(config, indent=4, sort_keys=True))
    logger.info('-=-=-=-=-=-=-=-=- CONFIG FILE -=-=-=-=-=-=-=-=-')
    config['hvd'] = hvd

    with tf.Graph().as_default():
        logger.info('getting datasets')
        trainds, validds = data_handler.get_datasets(config)

        input_shape = (config['data']['batch_size'],) + tuple(config['data']['image_shape'])
        target_shape = (config['data']['batch_size'],
                        config['data']['image_shape'][0])
        iterator = tf.compat.v1.data.Iterator.from_structure(
            (tf.float32, tf.int32), (input_shape, target_shape))
        input, target = iterator.get_next()
        training_init_op = iterator.make_initializer(trainds)
        valid_init_op = iterator.make_initializer(validds)

        with tf.device(device_str):
            is_training_pl = tf.compat.v1.placeholder(tf.bool, shape=())
            batch = tf.Variable(0)
            batch_size = tf.constant(config['data']['batch_size'])
            pred, endpoints = model.get_model(input, is_training_pl, config)
            logger.info('pred = %s  target = %s', pred.shape, target.shape)

            # pred = BxC, target = BxC
            loss = losses.get_loss(config)(labels=target, logits=pred)
            # tf.compat.v1.summary.scalar('loss/combined', loss)

            # learning_rate = pointnet_seg.get_learning_rate(batch, config) * hvd.size()
            learning_rate = lr_func.get_learning_rate(batch * batch_size, config)
            tf.compat.v1.summary.scalar('learning_rate', learning_rate)

            if config['optimizer']['name'] == 'adam':
                optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)

            # adding Horovod distributed optimizer
            if hvd:
                optimizer = hvd.DistributedOptimizer(optimizer)

            # create the training operator
            train_op = optimizer.minimize(loss, global_step=batch)

            # Add ops to save and restore all the variables.
            saver = tf.compat.v1.train.Saver()

        merged = tf.compat.v1.summary.merge_all()

        logger.info('create session')

        config_proto = tf.compat.v1.ConfigProto()
        if 'gpu' in device_str:
            config_proto.gpu_options.allow_growth = True
            config_proto.gpu_options.visible_device_list = os.environ['CUDA_VISIBLE_DEVICES']
        else:
            config_proto.allow_soft_placement = True
            config_proto.intra_op_parallelism_threads = args.intraop
            config_proto.inter_op_parallelism_threads = args.interop

        # Initialize an iterator over a dataset with 10 elements.
        sess = tf.compat.v1.Session(config=config_proto)

        # create tensorboard writers
        if hvd and hvd.rank() == 0:
            train_writer = tf.compat.v1.summary.FileWriter(
                os.path.join(args.logdir, 'train'), sess.graph)
            valid_writer = tf.compat.v1.summary.FileWriter(
                os.path.join(args.logdir, 'valid'), sess.graph)

        # initialize global vars and horovod broadcast initial model
        init = tf.compat.v1.global_variables_initializer()
        sess.run(init, {is_training_pl: True})
        if hvd:
            sess.run(hvd.broadcast_global_variables(0))

        logger.info('running over data')
        status_interval = config['training']['status']
        loss_sum = 0.
        for epoch in range(config['training']['epochs']):
            logger.info('epoch %s of %s', epoch + 1, config['training']['epochs'])

            # initialize the data iterator for training loop
            sess.run(training_init_op)

            # training loop
            start = time.time()
            while True:
                try:
                    # set that we are training
                    feed_dict = {is_training_pl: True}
                    summary, step, _, loss_val = sess.run(
                        [merged, batch, train_op, loss], feed_dict=feed_dict)

                    # report status periodically
                    if step % status_interval == 0:
                        end = time.time()
                        duration = end - start
                        logger.info('step: %10d   imgs/sec: %10.6f', step,
                                    float(status_interval) * config['data']['batch_size'] / duration)
                        start = time.time()
                # exception thrown when data is done
                except tf.errors.OutOfRangeError:
                    logger.info(' end of epoch ')
                    saver.save(sess,
                               os.path.join(args.logdir, "model.ckpt"),
                               global_step=step)
                    break

            logger.info('running validation')
            # initialize the validation data iterator
            sess.run(valid_init_op)
            steps = 0.
sys.path.insert(i, p)
i += 1

print()

try:
    from mpi4py import MPI
    name = MPI.Get_processor_name()
    comm = MPI.COMM_WORLD
    print("mpi4py:", "name: %s," % name,
          "rank: %i," % comm.Get_rank(),
          "size: %i" % comm.Get_size())
    # Get the names of all the other hosts
    hosts = comm.allgather((comm.Get_rank(), name))
    print("  all hosts:", {key: item for (key, item) in hosts})
except ImportError:
    print("mpi4py not available")

print("Import TF now...")
import tensorflow as tf
print("TF version:", tf.__version__)
import horovod
print("Horovod version:", horovod.__version__)
import horovod.tensorflow as hvd

# Initialize Horovod
hvd.init()
print("pid %i: hvd: rank: %i, size: %i, local_rank %i, local_size %i" % (
    os.getpid(), hvd.rank(), hvd.size(), hvd.local_rank(), hvd.local_size()))
def log_csv(model, batch_size, device, num_devices, num_devices_per_node,
            disable_ib, disable_nccl_p2p, img_sec_mean, img_sec_conf,
            total_img_sec_mean, total_img_sec_conf):
    if hvd.rank() != 0:
        return
    with open('/var/scratch/sdhar/logs/tensorflow_synthetic.csv', 'a',
              newline='') as f:
        csvwriter = csv.writer(f, lineterminator="\n")
        csvwriter.writerow([
            model, batch_size, device, num_devices, num_devices_per_node,
            disable_ib, disable_nccl_p2p, img_sec_mean, img_sec_conf,
            total_img_sec_mean, total_img_sec_conf
        ])


log_csv(
    args.model,
    str(args.batch_size),
    device,
    str(hvd.size()),
    str(hvd.local_size()),
    # Disable infiniband
    str(args.disable_ib),
    # Disable NCCL P2P Communication
    str(args.disable_p2p),
    str(img_sec_mean),
    str(img_sec_conf),
    str(hvd.size() * img_sec_mean),
    str(hvd.size() * img_sec_conf))
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config.gpu_options.allow_growth = False
    config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
model = getattr(applications, args.model)(weights=None)

lr_scaler = hvd.size()
# By default, Adasum doesn't need scaling when increasing batch size.
# If used with NCCL, scale lr by local_size.
if args.use_adasum:
    lr_scaler = hvd.local_size() if args.cuda and hvd.nccl_built() else 1

opt = tf.train.GradientDescentOptimizer(0.01 * lr_scaler)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
opt = hvd.DistributedOptimizer(
    opt, compression=compression,
    op=hvd.Adasum if args.use_adasum else hvd.Average)

init = tf.global_variables_initializer()
bcast_op = hvd.broadcast_global_variables(0)
def main():
    """Simple starter program for tensorflow models."""
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-c', '--config', dest='config_filename',
                        help='configuration filename in json format [default: %s]' % DEFAULT_CONFIG,
                        default=DEFAULT_CONFIG)
    parser.add_argument('--interop', type=int,
                        help='set Tensorflow "inter_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTEROP,
                        default=DEFAULT_INTEROP)
    parser.add_argument('--intraop', type=int,
                        help='set Tensorflow "intra_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTRAOP,
                        default=DEFAULT_INTRAOP)
    parser.add_argument('-l', '--logdir', default=DEFAULT_LOGDIR,
                        help='define location to save log information [default: %s]' % DEFAULT_LOGDIR)
    parser.add_argument('--horovod', default=False, action='store_true',
                        help='use MPI with horovod')
    parser.add_argument('--profiler', default=False, action='store_true',
                        help='use TF profiler, needs CUPTI in LD_LIBRARY_PATH for Cuda')
    parser.add_argument('--profrank', default=0, type=int,
                        help='set which rank to profile')
    parser.add_argument('--batch-term', dest='batch_term', type=int, default=0,
                        help='if set, terminates training after the specified number of batches')
    parser.add_argument('--evaluate',
                        help='evaluate a pre-trained model file on the test data set only.')
    parser.add_argument('--train-more', dest='train_more',
                        help='load a pre-trained model file and continue training.')
    parser.add_argument('--debug', dest='debug', default=False, action='store_true',
                        help='set logger to DEBUG')
    parser.add_argument('--error', dest='error', default=False, action='store_true',
                        help='set logger to ERROR')
    parser.add_argument('--warning', dest='warning', default=False, action='store_true',
                        help='set logger to WARNING')
    parser.add_argument('--logfilename', dest='logfilename', default=None,
                        help='if set, logging information will go to file')
    args = parser.parse_args()

    hvd = None
    rank = 0
    nranks = 1
    logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:%(name)s:%(message)s'
    logging_datefmt = '%Y-%m-%d %H:%M:%S'
    logging_level = logging.INFO

    if args.horovod:
        print('importing horovod')
        sys.stdout.flush()
        sys.stderr.flush()
        import horovod
        import horovod.tensorflow as hvd
        hvd.init()
        logging_format = ('%(asctime)s %(levelname)s:%(process)s:%(thread)s:'
                          + ('%05d' % hvd.rank()) + ':%(name)s:%(message)s')
        rank = hvd.rank()
        nranks = hvd.size()
        if rank > 0:
            logging_level = logging.WARNING

    # Setup logging
    if args.debug and not args.error and not args.warning:
        logging_level = logging.DEBUG
        os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0'
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
    elif not args.debug and args.error and not args.warning:
        logging_level = logging.ERROR
    elif not args.debug and not args.error and args.warning:
        logging_level = logging.WARNING

    logging.basicConfig(level=logging_level,
                        format=logging_format,
                        datefmt=logging_datefmt,
                        filename=args.logfilename)

    if hvd:
        logging.warning('host: %s rank: %5d size: %5d local rank: %5d local size: %5d',
                        socket.gethostname(), hvd.rank(), hvd.size(),
                        hvd.local_rank(), hvd.local_size())

    tf.config.threading.set_inter_op_parallelism_threads(args.interop)
    tf.config.threading.set_intra_op_parallelism_threads(args.intraop)

    # Setup GPUs; pin each rank to a single visible device.
    gpus = tf.config.list_physical_devices('GPU')
    logger.info('number of gpus: %s', len(gpus))
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if hvd and len(gpus) > 0:
        tf.config.set_visible_devices(gpus[hvd.local_rank() % len(gpus)], 'GPU')

    logging.info('using tensorflow version: %s (%s)', tf.__version__, tf.__git_version__)
    logging.info('using tensorflow from: %s', tf.__file__)
    if hvd:
        logging.info('using horovod version: %s', horovod.__version__)
        logging.info('using horovod from: %s', horovod.__file__)
    logging.info('logdir: %s', args.logdir)
    logging.info('interop: %s', args.interop)
    logging.info('intraop: %s', args.intraop)

    # this must be created after the config settings
    gtape = tf.GradientTape()
    if args.horovod:
        gtape = hvd.DistributedGradientTape(gtape)

    config = json.load(open(args.config_filename))
    # config['device'] = device_str
    config['profrank'] = args.profrank
    config['profiler'] = args.profiler
    config['logdir'] = args.logdir
    config['rank'] = rank
    config['nranks'] = nranks
    config['evaluate'] = False
    config['batch_term'] = args.batch_term
    if args.batch_term > 0:
        config['training']['epochs'] = 1
        config['training']['status'] = 1 if args.batch_term < config['training']['status'] \
            else config['training']['status']

    if args.evaluate is not None:
        config['evaluate'] = True
        config['model_file'] = args.evaluate
        config['training']['epochs'] = 1
        logger.info('evaluating model file: %s', args.evaluate)
    elif args.train_more is not None:
        config['train_more'] = True
        config['model_file'] = args.train_more
        logger.info('continuing model file: %s', args.train_more)

    # using mixed precision?
    if isinstance(config['model']['mixed_precision'], str):
        logger.info('using mixed precision: %s', config['model']['mixed_precision'])
        tf.keras.mixed_precision.set_global_policy(config['model']['mixed_precision'])

    logger.info('-=-=-=-=-=-=-=-=- CONFIG FILE -=-=-=-=-=-=-=-=-')
    logger.info('%s = \n %s', args.config_filename, json.dumps(config, indent=4, sort_keys=True))
    logger.info('-=-=-=-=-=-=-=-=- CONFIG FILE -=-=-=-=-=-=-=-=-')
    config['hvd'] = hvd
    sys.stdout.flush()
    sys.stderr.flush()

    trainds, testds = data_handler.get_datasets(config)

    logger.info('get model')
    net = model.get_model(config)
    loss_func = losses.get_loss(config)
    opt = get_optimizer(config)
    if isinstance(config['model']['mixed_precision'], str):
        opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)

    # initialize and create the model
    # input_shape = [config['data']['batch_size'], config['data']['num_points'], config['data']['num_features']]
    # output = net(tf.random.uniform(input_shape))

    # load previous model weights
    if args.evaluate:
        net.load_weights(args.evaluate)
    elif args.train_more:
        net.load_weights(args.train_more)

    # # synchronize models across ranks
    # if hvd:
    #     hvd.broadcast_variables(net.variables, root_rank=0)
    #     hvd.broadcast_variables(opt.variables(), root_rank=0)

    train_summary_writer = None
    test_summary_writer = None
    test_jet_writer = None
    test_ele_writer = None
    test_bkg_writer = None
    test_mean_writer = None
    if rank == 0:
        train_summary_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'train')
        test_summary_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'test')
        test_jet_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'jet_iou')
        test_ele_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'ele_iou')
        test_bkg_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'bkg_iou')
        test_mean_writer = tf.summary.create_file_writer(args.logdir + os.path.sep + 'mean_iou')

    # tf.keras.utils.plot_model(net, "network_model.png", show_shapes=True)
    # with train_summary_writer.as_default():
    #     tf.summary.graph(train_step.get_concrete_function().graph)

    batches_per_epoch = 0
    train_mIoU_sum = 0.
    test_mIoU_sum = 0.
    for epoch_num in range(config['training']['epochs']):
        logger.info('begin epoch %s', epoch_num)

        if not config['evaluate']:
            train_output = epoch_loop.one_train_epoch(config, trainds, net,
                                                      loss_func, opt, epoch_num,
                                                      train_summary_writer,
                                                      batches_per_epoch, gtape)
            batches_per_epoch = train_output['batches_per_epoch']
            train_mIoU_sum += train_output['mIoU']
            logger.info('train mIoU sum: %10.4f', train_mIoU_sum / (epoch_num + 1))

        test_output = epoch_loop.one_eval_epoch(config, testds, net,
                                                loss_func, opt, epoch_num,
                                                test_summary_writer,
                                                batches_per_epoch,
                                                test_jet_writer, test_ele_writer,
                                                test_bkg_writer, test_mean_writer)
        test_mIoU_sum += test_output['mIoU']
        logger.info('test mIoU sum: %10.4f', test_mIoU_sum / (epoch_num + 1))

        if rank == 0:
            with test_summary_writer.as_default():
                step = (epoch_num + 1) * batches_per_epoch
                tf.summary.scalar('metrics/mIoU_AOC',
                                  test_mIoU_sum / (epoch_num + 1), step=step)
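`epoch_loop.one_train_epoch` is project code that is not shown in this excerpt. A minimal sketch of the inner train step it presumably wraps, using the `hvd.DistributedGradientTape` pattern set up above; every name below is an assumption, and the tape is wrapped per step here, which is the pattern the Horovod docs show.

def train_step(net, loss_func, opt, x, y, first_batch, use_hvd):
    with tf.GradientTape() as tape:
        pred = net(x, training=True)
        loss = loss_func(y, pred)
    if use_hvd:
        # Averages gradients across ranks during tape.gradient().
        tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss, net.trainable_variables)
    opt.apply_gradients(zip(grads, net.trainable_variables))
    if use_hvd and first_batch:
        # Start all ranks from rank 0's initial weights and optimizer state.
        hvd.broadcast_variables(net.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    return loss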
def main(device, input_path_train, input_path_validation, dummy_data,
         downsampling_fact, downsampling_mode, channels, data_format,
         label_id, weights, image_dir, checkpoint_dir, trn_sz, val_sz,
         loss_type, model, decoder, fs_type, optimizer, batch, batchnorm,
         num_epochs, dtype, disable_checkpoints, disable_imsave, tracing,
         trace_dir, output_sampling, scale_factor, intra_threads, inter_threads):
    # init horovod
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        # not all horovod versions have local_size() implemented
        try:
            comm_local_size = hvd.local_size()
        except Exception:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".format(comm_size))

    # downsampling? recompute image dims
    image_height = image_height_orig // downsampling_fact
    image_width = image_width_orig // downsampling_fact

    # parameters
    per_rank_output = False
    loss_print_interval = 1

    # session config
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=inter_threads,
        intra_op_parallelism_threads=intra_threads,
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    sess_config.gpu_options.force_gpu_compatible = True

    # get data
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    train_files = load_data(input_path_train, True, trn_sz, horovod)
    valid_files = load_data(input_path_validation, False, val_sz, horovod)

    # print some stats
    if comm_rank == 0:
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Decoder: {}".format(decoder))
        print("Batch normalization: {}".format(batchnorm))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Loss scale factor: {}".format(scale_factor))
        print("Output sampling target: {}".format(output_sampling))
        # print optimizer parameters
        for k, v in optimizer.items():
            print("Solver Parameters: {k}: {v}".format(k=k, v=v))
        print("Num training samples: {}".format(train_files.shape[0]))
        print("Num validation samples: {}".format(valid_files.shape[0]))
        if dummy_data:
            print("Using synthetic dummy data")
        print("Disable checkpoints: {}".format(disable_checkpoints))
        print("Disable image save: {}".format(disable_imsave))

    # compute epochs and steps
    if fs_type == "local":
        num_samples = train_files.shape[0] // comm_local_size
    else:
        num_samples = train_files.shape[0] // comm_size
    print("num_samples: {} batch: {}".format(num_samples, batch))
    num_steps_per_epoch = num_samples // batch
    num_steps = num_epochs * num_steps_per_epoch
    if comm_rank == 0:
        print("Number of steps per epoch: {}".format(num_steps_per_epoch))
        print("Number of steps in total: {}".format(num_steps))
    if per_rank_output:
        print("Rank {} does {} steps per epoch".format(comm_rank, num_steps_per_epoch))

    with training_graph.as_default():

        if dummy_data:
            dummy_data_args = dict(batchsize=batch, data_format=data_format, dtype=dtype)
            trn_dataset = create_dummy_dataset(n_samples=trn_sz, num_epochs=num_epochs, **dummy_data_args)
            val_dataset = create_dummy_dataset(n_samples=val_sz, num_epochs=1, **dummy_data_args)
        else:
            # create readers
            trn_reader = h5_input_reader(input_path_train, channels, weights, dtype,
                                         normalization_file="stats.h5", update_on_read=False,
                                         data_format=data_format, label_id=label_id,
                                         sample_target=output_sampling)
            val_reader = h5_input_reader(input_path_validation, channels, weights, dtype,
                                         normalization_file="stats.h5", update_on_read=False,
                                         data_format=data_format, label_id=label_id)
            # create datasets
            if fs_type == "local":
                trn_dataset = create_dataset(trn_reader, train_files, batch, num_epochs,
                                             comm_local_size, comm_local_rank, dtype, shuffle=True)
                val_dataset = create_dataset(val_reader, valid_files, batch, 1,
                                             comm_local_size, comm_local_rank, dtype, shuffle=False)
            else:
                trn_dataset = create_dataset(trn_reader, train_files, batch, num_epochs,
                                             comm_size, comm_rank, dtype, shuffle=True)
                val_dataset = create_dataset(val_reader, valid_files, batch, 1,
                                             comm_size, comm_rank, dtype, shuffle=False)

        # create iterators
        handle = tf.placeholder(tf.string, shape=[], name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype, tf.string),
            ((batch, len(channels), image_height_orig, image_width_orig)
             if data_format == "channels_first"
             else (batch, image_height_orig, image_width_orig, len(channels)),
             (batch, image_height_orig, image_width_orig),
             (batch, image_height_orig, image_width_orig),
             (batch,)))
        next_elem = iterator.get_next()

        # if downsampling, do some preprocessing
        if downsampling_fact != 1:
            if downsampling_mode == "scale":
                # do downsampling
                rand_select = tf.cast(
                    tf.one_hot(tf.random_uniform((batch, image_height, image_width),
                                                 minval=0,
                                                 maxval=downsampling_fact * downsampling_fact,
                                                 dtype=tf.int32),
                               depth=downsampling_fact * downsampling_fact, axis=-1),
                    dtype=tf.int32)
                next_elem = (tf.layers.average_pooling2d(next_elem[0], downsampling_fact,
                                                         downsampling_fact, 'valid', data_format),
                             tf.reduce_max(tf.multiply(tf.image.extract_image_patches(
                                 tf.expand_dims(next_elem[1], axis=-1),
                                 [1, downsampling_fact, downsampling_fact, 1],
                                 [1, downsampling_fact, downsampling_fact, 1],
                                 [1, 1, 1, 1], 'VALID'), rand_select), axis=-1),
                             tf.squeeze(tf.layers.average_pooling2d(
                                 tf.expand_dims(next_elem[2], axis=-1), downsampling_fact,
                                 downsampling_fact, 'valid', "channels_last"), axis=-1),
                             next_elem[3])
            elif downsampling_mode == "center-crop":
                # some parameters
                length = 1. / float(downsampling_fact)
                offset = length / 2.
                boxes = [[offset, offset, offset + length, offset + length]] * batch
                box_ind = list(range(0, batch))
                crop_size = [image_height, image_width]
                # iterator.get_next() yields a tuple; use a list so elements can be reassigned
                next_elem = list(next_elem)
                # be careful with data order
                if data_format == "channels_first":
                    next_elem[0] = tf.transpose(next_elem[0], perm=[0, 2, 3, 1])
                # crop
                next_elem = [tf.image.crop_and_resize(next_elem[0], boxes, box_ind, crop_size,
                                                      method='bilinear', extrapolation_value=0,
                                                      name="data_cropping"),
                             ensure_type(tf.squeeze(tf.image.crop_and_resize(
                                 tf.expand_dims(next_elem[1], axis=-1), boxes, box_ind, crop_size,
                                 method='nearest', extrapolation_value=0,
                                 name="label_cropping"), axis=-1), tf.int32),
                             tf.squeeze(tf.image.crop_and_resize(
                                 tf.expand_dims(next_elem[2], axis=-1), boxes, box_ind, crop_size,
                                 method='bilinear', extrapolation_value=0,
                                 name="weight_cropping"), axis=-1),
                             next_elem[3]]
                # be careful with data order
                if data_format == "channels_first":
                    next_elem[0] = tf.transpose(next_elem[0], perm=[0, 3, 1, 2])
            else:
                raise ValueError("Error, downsampling mode {} not supported. "
                                 "Supported are [center-crop, scale]".format(downsampling_mode))

        # create init handles
        # trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        # val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        # compute the input filter number based on number of channels used
        num_channels = len(channels)

        # set up model
        model = deeplab_v3_plus_generator(num_classes=3, output_stride=8,
                                          base_architecture=model, decoder=decoder,
                                          batchnorm=batchnorm, pre_trained_model=None,
                                          batch_norm_decay=None, data_format=data_format)
        logit, prediction = model(next_elem[0], True, dtype)

        # set up loss
        loss = None
        # cast the logits to fp32
        logit = ensure_type(logit, tf.float32)
        if loss_type == "weighted":
            # cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1], logits=logit, weights=w_cast,
                reduction=tf.losses.Reduction.SUM)
            if scale_factor != 1.0:
                loss *= scale_factor
        elif loss_type == "weighted_mean":
            # cast weights to FP32
            w_cast = ensure_type(next_elem[2], tf.float32)
            loss = tf.losses.sparse_softmax_cross_entropy(
                labels=next_elem[1], logits=logit, weights=w_cast,
                reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
            if scale_factor != 1.0:
                loss *= scale_factor
        elif loss_type == "focal":
            # one-hot-encode
            labels_one_hot = tf.contrib.layers.one_hot_encoding(next_elem[1], 3)
            # cast to FP32
            labels_one_hot = ensure_type(labels_one_hot, tf.float32)
            loss = focal_loss(onehot_labels=labels_one_hot, logits=logit, alpha=1., gamma=2.)
        else:
            raise ValueError("Error, loss type {} not supported.".format(loss_type))

        # determine flops
        flops = graph_flops.graph_flops(
            format="NHWC" if data_format == "channels_last" else "NCHW",
            verbose=False, batch=batch, sess_config=sess_config)
        flops *= comm_size
        if comm_rank == 0:
            print('training flops: {:.3f} TF/step'.format(flops * 1e-12))

        # number of trainable parameters
        if comm_rank == 0:
            num_params = get_number_of_trainable_parameters()
            print('number of trainable parameters: {} ({} MB)'.format(
                num_params,
                num_params * (4 if dtype == tf.float32 else 2) * (2**-20)))

        if horovod:
            loss_avg = hvd.allreduce(ensure_type(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)

        tmpl = (loss if per_rank_output else loss_avg)

        # set up global step - keep on CPU
        with tf.device('/device:CPU:0'):
            global_step = tf.train.get_or_create_global_step()

        # set up optimizer
        if optimizer['opt_type'].startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op, lr = get_larc_optimizer(optimizer, loss, global_step,
                                              num_steps_per_epoch, horovod)
        else:
            train_op, lr = get_optimizer(optimizer, loss, global_step,
                                         num_steps_per_epoch, horovod)

        # set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(
            labels=next_elem[1],
            predictions=tf.argmax(prediction, axis=3),
            num_classes=3, weights=None,
            metrics_collections=None, updates_collections=None,
            name="iou_score")
        iou_reset_op = tf.variables_initializer(
            [i for i in tf.local_variables() if i.name.startswith('iou_score/')])
        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        if "gpu" in device.lower():
            with tf.device(device):
                mem_usage_ops = [tf.contrib.memory_stats.MaxBytesInUse(),
                                 tf.contrib.memory_stats.BytesLimit()]

        # hooks
        # these hooks are essential; regularize the step hook by adding one additional step at the end
        # hooks = [tf.train.StopAtStepHook(last_step=3)]
        # hooks = [tf.train.StopAtStepHook(num_steps=3)]
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        nvtx_callback = NVTXHook(skip_n_steps=0, name='TTTTTrain')
        hooks.append(nvtx_callback)

        # bcast init for bcasting the model after start
        if horovod:
            init_bcast = hvd.broadcast_global_variables(0)

        # initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        # checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = 5 * num_steps_per_epoch
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            if not disable_checkpoints:
                hooks.append(tf.train.CheckpointSaverHook(
                    checkpoint_dir=checkpoint_dir,
                    save_steps=checkpoint_save_freq,
                    saver=checkpoint_saver))
            # create image dir if it does not exist
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        # tracing
        if tracing is not None:
            import tracehook
            tracing_hook = tracehook.TraceHook(steps_to_trace=tracing,
                                               cache_traces=True,
                                               trace_dir=trace_dir)
            hooks.append(tracing_hook)
            print("############ tracing enabled")

        # instead of averaging losses over an entire epoch, use a moving
        # window average
        recent_losses = []
        loss_window_size = 10

        # start session
        with tf.train.MonitoredTrainingSession(config=sess_config, hooks=hooks) as sess:
            # initialize
            sess.run([init_op, init_local_op])
            # restore from checkpoint:
            if comm_rank == 0 and not disable_checkpoints:
                load_model(sess, checkpoint_saver, checkpoint_dir)
            # broadcast loaded model variables
            if horovod:
                sess.run(init_bcast)
            # create iterator handles
            trn_handle, val_handle = sess.run([trn_handle_string, val_handle_string])
            # init iterators
            sess.run(trn_init_op, feed_dict={handle: trn_handle})
            sess.run(val_init_op, feed_dict={handle: val_handle})

            # figure out what step we're on (it won't be 0 if we are
            # restoring from a checkpoint) so we can count from there
            train_steps = sess.run([global_step])[0]

            # do the training
            epoch = 1
            step = 1
            prev_mem_usage = 0
            t_sustained_start = time.time()
            r_peak = 0

            # warmup loops
            print("### Warmup for 5 steps")
            start_time = time.time()
            # while not sess.should_stop():
            for _ in range(5):
                print('warmup train_steps is {}'.format(train_steps))
                if train_steps == 5:
                    # if have_pycuda:
                    #     pyc.driver.start_profiler()
                    print(train_steps)
                _ = sess.run([train_op], feed_dict={handle: trn_handle})
                # tmp_loss = sess.run([(loss if per_rank_output else loss_avg)], feed_dict={handle: trn_handle})
                if train_steps == 5:
                    # if have_pycuda:
                    #     pyc.driver.stop_profiler()
                    print(train_steps)
                train_steps += 1
            end_time = time.time()
            print("### Warmup time: {:0.2f}".format(end_time - start_time))

            # start profiling
            print('Begin training loop')
            # if have_cupy:
            #     cupy.cuda.profiler.start()
            # while not sess.should_stop():
            for _ in range(1):
                try:
                    print('train_steps is {}'.format(train_steps))
                    if train_steps == 5:
                        if have_pycuda:
                            pyc.driver.start_profiler()
                        print(train_steps)
                    _ = sess.run([tmpl], feed_dict={handle: trn_handle})
                    # _ = sess.run([train_op], feed_dict={handle: trn_handle})
                    if train_steps == 5:
                        if have_pycuda:
                            pyc.driver.stop_profiler()
                        print(train_steps)
                    train_steps += 1
                except tf.errors.OutOfRangeError:
                    break
            # end of profiling
            # if have_cupy:
            #     cupy.cuda.profiler.stop()

            # write any cached traces to disk
            if tracing is not None:
                tracing_hook.write_traces()

    print('All done')
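The function above declares `recent_losses` and `loss_window_size` but the windowed average itself does not appear in the excerpt. A minimal sketch of what such a moving-window loss average could look like (the helper name is an assumption):

from collections import deque

recent_losses = deque(maxlen=10)  # loss_window_size == 10

def windowed_loss(new_loss):
    # Average over the last up-to-10 step losses instead of the whole epoch.
    recent_losses.append(float(new_loss))
    return sum(recent_losses) / len(recent_losses)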
def main():
    """Simple starter program for tensorflow models."""
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-c', '--config', dest='config_filename',
                        help='configuration filename in json format [default: %s]' % DEFAULT_CONFIG,
                        default=DEFAULT_CONFIG)
    parser.add_argument('--interop', type=int,
                        help='set Tensorflow "inter_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTEROP,
                        default=DEFAULT_INTEROP)
    parser.add_argument('--intraop', type=int,
                        help='set Tensorflow "intra_op_parallelism_threads" session config variable [default: %s]' % DEFAULT_INTRAOP,
                        default=DEFAULT_INTRAOP)
    parser.add_argument('-l', '--logdir', default=DEFAULT_LOGDIR,
                        help='define location to save log information [default: %s]' % DEFAULT_LOGDIR)
    parser.add_argument('-r', '--restore',
                        help='define location from which to load a saved model')
    parser.add_argument('--horovod', default=False, action='store_true',
                        help='use horovod for MPI parallel training')
    parser.add_argument('--float16', default=False, action='store_true',
                        help='use float16 precision training')
    parser.add_argument('--debug', dest='debug', default=False, action='store_true',
                        help='set logger to DEBUG')
    parser.add_argument('--error', dest='error', default=False, action='store_true',
                        help='set logger to ERROR')
    parser.add_argument('--warning', dest='warning', default=False, action='store_true',
                        help='set logger to WARNING')
    parser.add_argument('--logfilename', dest='logfilename', default=None,
                        help='if set, logging information will go to file')
    args = parser.parse_args()

    # default to INFO; the flags below may override it
    logging_level = logging.INFO
    if args.debug and not args.error and not args.warning:
        logging_level = logging.DEBUG
    elif not args.debug and args.error and not args.warning:
        logging_level = logging.ERROR
    elif not args.debug and not args.error and args.warning:
        logging_level = logging.WARNING

    logging_format = '%(asctime)s %(levelname)s:%(process)s:%(thread)s:%(name)s:%(message)s'
    logging_datefmt = '%Y-%m-%d %H:%M:%S'

    hvd = None
    rank = 0
    nranks = 1
    local_rank = 0
    local_nranks = 1
    if args.horovod:
        import horovod
        import horovod.tensorflow as hvd
        hvd.init()
        rank = hvd.rank()
        nranks = hvd.size()
        local_rank = hvd.local_rank()
        local_nranks = hvd.local_size()
        # pin each rank to one GPU
        os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())
        logging_format = ('%(asctime)s %(levelname)s:%(process)s:%(thread)s:'
                          + ('%05d' % rank) + ':%(name)s:%(message)s')
        if rank > 0:
            logging_level = logging.WARNING

    logging.basicConfig(level=logging_level,
                        format=logging_format,
                        datefmt=logging_datefmt,
                        filename=args.logfilename)
    logging.warning('rank: %5d size: %5d local rank: %5d local size: %5d',
                    rank, nranks, local_rank, local_nranks)
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        logging.warning('CUDA_VISIBLE_DEVICES=%s %s',
                        os.environ['CUDA_VISIBLE_DEVICES'],
                        device_lib.list_local_devices())
    else:
        logging.info('CUDA_VISIBLE_DEVICES not defined in os.environ')
    logging.info('using tensorflow version: %s', tf.__version__)
    logging.info('using tensorflow from: %s', tf.__file__)
    if hvd:
        logging.info('using horovod version: %s', horovod.__version__)
        logging.info('using horovod from: %s', horovod.__file__)
    logging.info('logdir: %s', args.logdir)
    logging.info('interop: %s', args.interop)
    logging.info('intraop: %s', args.intraop)
    logging.info('restore: %s', args.restore)
    logging.info('float16: %s', args.float16)

    device_str = '/CPU:0'
    if tf.test.is_gpu_available():
        gpus = tf.config.experimental.list_logical_devices('GPU')
        logger.warning('gpus = %s', gpus)
        device_str = gpus[0].name
    logger.warning('device: %s', device_str)

    config = json.load(open(args.config_filename))
    config['device'] = device_str
    config['float16'] = args.float16
    nclasses = len(config['data']['classes'])
    dtype_input = tf.float16 if config['float16'] else tf.float32
    dtype_target = tf.int16 if config['float16'] else tf.int32
    # only use batch norm when batches hold more than one sample
    bn = config['data']['batch_size'] > 1

    logger.info('-=-=-=-=-=-=-=-=- CONFIG FILE -=-=-=-=-=-=-=-=-')
    logger.info('%s = \n %s', args.config_filename, json.dumps(config, indent=4, sort_keys=True))
    logger.info('-=-=-=-=-=-=-=-=- CONFIG FILE -=-=-=-=-=-=-=-=-')
    if hvd:
        config['hvd'] = hvd
    config['rank'] = rank
    config['nranks'] = nranks

    with tf.Graph().as_default():
        logger.info('getting datasets')
        trainds, validds = data_handler.get_datasets(config)

        iterator = tf.compat.v1.data.Iterator.from_structure(
            (dtype_input, dtype_target),
            ((config['data']['batch_size'], config['data']['num_points'], config['data']['num_features']),
             (config['data']['batch_size'], config['data']['num_points'])))
        input, target = iterator.get_next()
        training_init_op = iterator.make_initializer(trainds)
        validation_init_op = iterator.make_initializer(validds)

        with tf.device(device_str):
            # input, target = pointnet_seg.placeholder_inputs(config['data']['batch_size'],
            #                                                 config['data']['num_points'],
            #                                                 config['data']['num_features'])
            # handle = tf.compat.v1.placeholder(tf.string, shape=[])
            # iterator = tf.compat.v1.data.Iterator.from_string_handle(handle, (tf.float32, tf.int32),
            #     ((config['data']['batch_size'], config['data']['num_points'], config['data']['num_features']),
            #      (config['data']['batch_size'], config['data']['num_points'])))
            # input, target = iterator.get_next()
            # iter_train = trainds.make_one_shot_iterator()
            # iter_valid = validds.make_one_shot_iterator()
            is_training_pl = tf.compat.v1.placeholder(tf.bool, shape=())
            batch = tf.Variable(0)

            pred, endpoints = pointnet_seg.get_model(input, is_training_pl, nclasses,
                                                     dtype=dtype_input, bn=bn)
            loss = pointnet_seg.get_loss(pred, target, endpoints, dtype=dtype_input)
            tf.compat.v1.summary.scalar('loss/combined', loss)

            accuracy = pointnet_seg.get_accuracy(pred, target, dtype=dtype_input)
            tf.compat.v1.summary.scalar('accuracy/combined', accuracy)

            learning_rate = pointnet_seg.cyclic_learning_rate(batch * config['data']['batch_size'], config)
            tf.compat.v1.summary.scalar('learning_rate', learning_rate)

            optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
            if hvd:
                optimizer = hvd.DistributedOptimizer(optimizer)
            train_op = optimizer.minimize(loss, global_step=batch)
            # grads_and_vars = optimizer.compute_gradients(loss, tf.trainable_variables())
            # train = optimizer.apply_gradients(grads_and_vars)

            # Add ops to save and restore all the variables.
            saver = tf.train.Saver()
            merged = tf.compat.v1.summary.merge_all()

        logger.info('create session')
        config_proto = tf.compat.v1.ConfigProto()
        if 'gpu' in device_str:
            config_proto.gpu_options.allow_growth = True
            config_proto.gpu_options.visible_device_list = os.environ['CUDA_VISIBLE_DEVICES']
        else:
            config_proto.allow_soft_placement = True
            config_proto.intra_op_parallelism_threads = args.intraop
            config_proto.inter_op_parallelism_threads = args.interop

        sess = tf.compat.v1.Session(config=config_proto)

        if rank == 0:
            train_writer = tf.compat.v1.summary.FileWriter(os.path.join(args.logdir, 'train'), sess.graph)
            valid_writer = tf.compat.v1.summary.FileWriter(os.path.join(args.logdir, 'valid'), sess.graph)
        # train_handle = sess.run(iter_train.string_handle())

        if args.restore:
            logger.info('restoring model: %s', args.restore)
            saver.restore(sess, args.restore)
        else:
            init = tf.compat.v1.global_variables_initializer()
            sess.run(init, {is_training_pl: True})

        if hvd:
            sess.run(hvd.broadcast_global_variables(0))

        logger.info('running over data')
        status_interval = config['training']['status']
        total_acc = 0.
        loss_sum = 0.
        for epoch in range(config['training']['epochs']):
            logger.info('epoch %s of %s (logdir: %s)',
                        epoch + 1, config['training']['epochs'], args.logdir)

            sess.run(training_init_op)
            start = time.time()
            while True:
                try:
                    feed_dict = {is_training_pl: True}
                    summary, step, _, loss_val, accuracy_val = sess.run(
                        [merged, batch, train_op, loss, accuracy], feed_dict=feed_dict)
                    if rank == 0:
                        train_writer.add_summary(summary, step)
                    total_acc += accuracy_val
                    loss_sum += loss_val
                    # logger.info(f'pred_val.shape = {pred_val.shape}')
                    # logger.info(f'target_val.shape = {target_val.shape}')
                    if step % status_interval == 0:
                        end = time.time()
                        duration = end - start
                        logger.info('step: %10d mean loss: %10.6f accuracy: %10.6f imgs/sec: %10.6f',
                                    step, loss_sum / float(status_interval),
                                    total_acc / float(status_interval),
                                    float(status_interval) * config['data']['batch_size'] / duration)
                        start = time.time()
                        total_acc = 0.
                        loss_sum = 0.
                except tf.errors.OutOfRangeError:
                    ss = saver.save(sess, os.path.join(args.logdir, "model.ckpt"), global_step=step)
                    logger.info(' end of epoch: %s', ss)
                    break

            sess.run(validation_init_op)
            logger.info('running validation')
            total_acc = 0.
            total_loss = 0.
            steps = 0.
            while True:
                try:
                    feed_dict = {is_training_pl: False}
                    summary, valid_step, loss_val, accuracy_val = sess.run(
                        [merged, batch, loss, accuracy], feed_dict=feed_dict)
                    total_acc += accuracy_val
                    total_loss += loss_val
                    steps += 1.
                    logger.info('valid step: %s', valid_step)
                    if rank == 0:
                        valid_writer.add_summary(summary, valid_step)
                except tf.errors.OutOfRangeError:
                    total_loss = total_loss / steps
                    total_acc = total_acc / steps
                    logger.info(' end of validation mean loss: %10.6f mean acc: %10.6f',
                                total_loss, total_acc)
                    break
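`pointnet_seg.cyclic_learning_rate` is project code that is not shown here. A triangular cyclic schedule in the spirit of Smith (2017), which such a helper might implement; all parameter values below are assumptions:

import numpy as np

def cyclic_lr(step, base_lr=1e-4, max_lr=1e-3, step_size=2000):
    # lr ramps linearly from base_lr to max_lr and back, with period 2*step_size steps.
    cycle = np.floor(1 + step / (2 * step_size))
    x = np.abs(step / step_size - 2 * cycle + 1)
    return base_lr + (max_lr - base_lr) * max(0.0, 1.0 - float(x))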
def main(): """ Main entry. """ print("pid %i: Hello" % os.getpid()) print("Python version:", sys.version) print("Env:") for key, value in sorted(os.environ.items()): print("%s=%s" % (key, value)) print() if os.environ.get("PE_HOSTFILE", ""): try: print("PE_HOSTFILE, %s:" % os.environ["PE_HOSTFILE"]) with open(os.environ["PE_HOSTFILE"], "r") as f: print(f.read()) except FileNotFoundError as exc: print(exc) if os.environ.get("SGE_JOB_SPOOL_DIR", ""): print("SGE_JOB_SPOOL_DIR, %s:" % os.environ["SGE_JOB_SPOOL_DIR"]) for name in os.listdir(os.environ["SGE_JOB_SPOOL_DIR"]): print(name) print() if os.environ.get("OMPI_FILE_LOCATION", ""): print("OMPI_FILE_LOCATION, %s:" % os.environ["OMPI_FILE_LOCATION"]) d = os.path.dirname(os.path.dirname(os.environ["OMPI_FILE_LOCATION"])) print("dir:", d) for name in os.listdir(d): print(name) print() print("contact.txt:") with open("%s/contact.txt" % d, "r") as f: print(f.read()) print() # https://github.com/horovod/horovod/issues/1123 try: import ctypes ctypes.CDLL("libhwloc.so", mode=ctypes.RTLD_GLOBAL) except Exception as exc: print("Exception while loading libhwloc.so, ignoring...", exc) print("sys.path:") i = 0 for p in list(sys.path): print(p) if "/.local/lib/" in p: # small workaround if the order is messed up... prefer from .local/lib print("(insert at position %i)" % i) sys.path.insert(i, p) i += 1 print() try: from mpi4py import MPI # noqa name = MPI.Get_processor_name() comm = MPI.COMM_WORLD print("mpi4py:", "name: %s," % name, "rank: %i," % comm.Get_rank(), "size: %i" % comm.Get_size()) hosts = comm.allgather( (comm.Get_rank(), name)) # Get the names of all the other hosts print(" all hosts:", {key: item for (key, item) in hosts}) except ImportError: print("mpi4py not available") print("Import TF now...") import tensorflow as tf print("TF version:", tf.__version__) import horovod # noqa print("Horovod version:", horovod.__version__) import horovod.tensorflow as hvd # noqa # Initialize Horovod hvd.init() print("pid %i: hvd: rank: %i, size: %i, local_rank %i, local_size %i" % (os.getpid(), hvd.rank(), hvd.size(), hvd.local_rank(), hvd.local_size()))
#!/usr/bin/env python3

import os
print("pid %i: Hello" % os.getpid())

import tensorflow as tf
import horovod.tensorflow as hvd

# Initialize Horovod
hvd.init()
print("pid %i: hvd: rank: %i, size: %i, local_rank %i, local_size %i"
      % (os.getpid(), hvd.rank(), hvd.size(), hvd.local_rank(), hvd.local_size()))
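This hello-world is typically launched with one process per GPU; for example (the script name here is hypothetical):

# horovodrun -np 4 python hello_horovod.py
# or, with a plain MPI launcher:
# mpirun -np 4 python hello_horovod.py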
def main(input_path, blocks, weights, image_dir, checkpoint_dir, trn_sz,
         learning_rate, loss_type, fs_type, opt_type, batch, batchnorm,
         num_epochs, dtype, chkpt, filter_sz, growth, disable_training,
         enable_tf_timeline):
    options = None
    run_metadata = None
    many_runs_timeline = None

    timeline_trace_fp = open("timeline_trace.pickle", "wb")
    options, run_metadata, many_runs_timeline, min_timeline_step, max_timeline_step = \
        init_timeline_configs(enable_tf_timeline, tf.RunOptions.FULL_TRACE, -1, -1)

    global_time_logger = logger(-1, "Global Total Time", -1, True)
    global_time_logger.start_timer()

    # init horovod
    initialization_timer_logger = logger(-1, "Initialize Horovod", -1, True)
    initialization_timer_logger.start_timer()
    nvtx.RangePush("init horovod", 1)
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        # not all horovod versions have local_size() implemented
        try:
            comm_local_size = hvd.local_size()
        except Exception:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".format(comm_size))
    nvtx.RangePop()  # init horovod
    initialization_timer_logger.set_rank(int(comm_rank))
    initialization_timer_logger.end_timer()

    global_time_logger.set_rank(int(comm_rank))

    # parameters
    channels = [0, 1, 2, 10]
    per_rank_output = False
    loss_print_interval = 1

    # session config
    initialization_timer_logger.start_timer(comm_rank, "Configure Session")
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=6,
        intra_op_parallelism_threads=1,
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    initialization_timer_logger.end_timer()

    # get data
    initialization_timer_logger.start_timer(comm_rank, "Get Data")
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    trn_data, val_data, tst_data = load_data(input_path, trn_sz, comm_rank)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("done.")
    initialization_timer_logger.end_timer()

    # print some stats
    if comm_rank == 0:
        print("Learning Rate: {}".format(learning_rate))
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Batch normalization: {}".format(batchnorm))
        print("Blocks: {}".format(blocks))
        print("Growth rate: {}".format(growth))
        print("Filter size: {}".format(filter_sz))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Optimizer type: {}".format(opt_type))
        print("Num training samples: {}".format(trn_data.shape[0]))
        print("Num validation samples: {}".format(val_data.shape[0]))

    io_training_time_logger = logger(comm_rank, "IO and Training", -1, True)
    io_training_time_logger.start_timer()

    with training_graph.as_default():
        nvtx.RangePush("TF Init", 3)
        # create readers
        trn_reader = h5_input_reader(input_path, channels, weights, dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False, comm_rank=comm_rank)
        val_reader = h5_input_reader(input_path, channels, weights, dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False, comm_rank=comm_rank)
        # create datasets
        if fs_type == "local":
            trn_dataset = create_dataset(trn_reader, trn_data, batch, num_epochs,
                                         comm_local_size, comm_local_rank, dtype, shuffle=True)
            val_dataset = create_dataset(val_reader, val_data, batch, 1,
                                         comm_local_size, comm_local_rank, dtype, shuffle=False)
        else:
            trn_dataset = create_dataset(trn_reader, trn_data, batch, num_epochs,
                                         comm_size, comm_rank, dtype, shuffle=True)
            val_dataset = create_dataset(val_reader, val_data, batch, 1,
                                         comm_size, comm_rank, dtype, shuffle=False)

        # create iterators
        handle = tf.placeholder(tf.string, shape=[], name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype),
            ((batch, len(channels), image_height, image_width),
             (batch, image_height, image_width),
             (batch, image_height, image_width)))
        next_elem = iterator.get_next()

        # create init handles
        # trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        # val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        # set up model
        logit, prediction = create_tiramisu(3, next_elem[0], image_height, image_width,
                                            len(channels), loss_weights=weights,
                                            nb_layers_per_block=blocks, p=0.2, wd=1e-4,
                                            dtype=dtype, batchnorm=batchnorm,
                                            growth_rate=growth, filter_sz=filter_sz,
                                            comm_rank=comm_rank)

        # set up loss
        labels_one_hot = tf.cast(tf.contrib.layers.one_hot_encoding(next_elem[1], 3), dtype=dtype)
        loss = None
        if loss_type == "weighted":
            loss = tf.losses.softmax_cross_entropy(onehot_labels=labels_one_hot,
                                                   logits=logit, weights=next_elem[2])
        elif loss_type == "focal":
            loss = focal_loss(onehot_labels=labels_one_hot, logits=logit, alpha=1., gamma=2.)
        else:
            raise ValueError("Error, loss type {} not supported.".format(loss_type))

        if horovod:
            loss_avg = hvd.allreduce(tf.cast(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)

        # set up global step
        global_step = tf.train.get_or_create_global_step()

        # set up optimizer
        if opt_type.startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op = get_larc_optimizer(opt_type.split("-")[1], loss, global_step,
                                          learning_rate, LARC_mode="clip",
                                          LARC_eta=0.002, LARC_epsilon=1. / 16000.)
        else:
            train_op = get_optimizer(opt_type, loss, global_step, learning_rate)

        # set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(
            labels=next_elem[1],
            predictions=tf.argmax(prediction, axis=3),
            num_classes=3, weights=None,
            metrics_collections=None, updates_collections=None,
            name="iou_score")
        iou_reset_op = tf.variables_initializer(
            [i for i in tf.local_variables() if i.name.startswith('iou_score/')])
        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        # compute epochs and steps
        if fs_type == "local":
            num_samples = trn_data.shape[0] // comm_local_size
        else:
            num_samples = trn_data.shape[0] // comm_size
        # num_steps_per_epoch = num_samples // batch
        num_steps_per_epoch = 10
        num_steps = num_epochs * num_steps_per_epoch
        if per_rank_output:
            print("Rank {} does {} steps per epoch".format(comm_rank, num_steps_per_epoch))

        # hooks
        # these hooks are essential; regularize the step hook by adding one additional step at the end
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        # bcast init for bcasting the model after start
        init_bcast = hvd.broadcast_global_variables(0)
        # initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        # checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = num_steps_per_epoch * 2
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            listener = checkpoint_listener(comm_rank, True)
            hooks.append(tf.train.CheckpointSaverHook(checkpoint_dir=checkpoint_dir,
                                                      save_steps=checkpoint_save_freq,
                                                      saver=checkpoint_saver,
                                                      listeners=[listener]))
            # create image dir if it does not exist
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        ## DEBUG
        ## summary
        # if comm_rank == 0:
        #     print("write graph for debugging")
        #     tf.summary.scalar("loss", loss)
        #     summary_op = tf.summary.merge_all()
        #     #hooks.append(tf.train.SummarySaverHook(save_steps=num_steps_per_epoch, summary_writer=summary_writer, summary_op=summary_op))
        #     with tf.Session(config=sess_config) as sess:
        #         sess.run([init_op, init_local_op])
        #         #create iterator handles
        #         trn_handle = sess.run(trn_handle_string)
        #         #init iterators
        #         sess.run(trn_init_op, feed_dict={handle: trn_handle, datafiles: trn_data, labelfiles: trn_labels})
        #         #summary:
        #         sess.run(summary_op, feed_dict={handle: trn_handle})
        #         #summary file writer
        #         summary_writer = tf.summary.FileWriter('./logs', sess.graph)
        ## DEBUG

        # start session
        with tf.train.MonitoredTrainingSession(config=sess_config, hooks=hooks) as sess:
            # initialize
            sess.run([init_op, init_local_op])
            # restore from checkpoint:
            if comm_rank == 0:
                load_model(sess, checkpoint_saver, checkpoint_dir, comm_rank)
            # broadcast loaded model variables
            sess.run(init_bcast)
            # create iterator handles
            trn_handle, val_handle = sess.run([trn_handle_string, val_handle_string],
                                              options=options, run_metadata=run_metadata)
            update_timeline_in_range(enable_tf_timeline, run_metadata, many_runs_timeline,
                                     "create_iterator_handle.json")
            # init iterators
            sess.run(trn_init_op, feed_dict={handle: trn_handle},
                     options=options, run_metadata=run_metadata)
            update_timeline_in_range(enable_tf_timeline, run_metadata, many_runs_timeline,
                                     "init_train_iterator_handle.json")
            sess.run(val_init_op, feed_dict={handle: val_handle},
                     options=options, run_metadata=run_metadata)
            update_timeline_in_range(enable_tf_timeline, run_metadata, many_runs_timeline,
                                     "init_val_iterator_handle.json")
            nvtx.RangePop()  # TF Init

            # do the training
            epoch = 1
            step = 1
            train_loss = 0.
            nvtx.RangePush("Training Loop", 4)
            nvtx.RangePush("Epoch", epoch)
            start_time = time.time()

            training_loop_timer_logger = logger(comm_rank, "Training Loop", -1, True)
            training_loop_timer_logger.start_timer()
            train_steps = 0
            while not sess.should_stop():
                # training loop
                try:
                    training_iteration_time_logger = logger(comm_rank, "Training Iteration",
                                                            epoch, True)
                    training_iteration_time_logger.start_timer()
                    nvtx.RangePush("Step", step)

                    if disable_training:
                        train_steps = sess.run([global_step], feed_dict={handle: trn_handle},
                                               options=options, run_metadata=run_metadata)
                        update_timeline_in_range(enable_tf_timeline, run_metadata,
                                                 many_runs_timeline, train_steps[0],
                                                 "train_" + str(global_step) + ".json",
                                                 min_timeline_step, max_timeline_step)
                        train_steps_in_epoch = train_steps[0] % num_steps_per_epoch

                        # do the validation phase
                        if train_steps_in_epoch == 0:
                            eval_steps = 0
                            while True:
                                try:
                                    sess.run([next_elem[1]], feed_dict={handle: val_handle},
                                             options=options, run_metadata=run_metadata)
                                    update_timeline_in_range(enable_tf_timeline, run_metadata,
                                                             many_runs_timeline,
                                                             "val_dict" + str(eval_steps) + ".json")
                                    eval_steps += 1
                                except tf.errors.OutOfRangeError:
                                    sess.run(val_init_op, feed_dict={handle: val_handle},
                                             options=options, run_metadata=run_metadata)
                                    update_timeline_in_range(enable_tf_timeline, run_metadata,
                                                             many_runs_timeline,
                                                             "val_dict_out_" + str(eval_steps) + ".json")
                                    break
                    else:
                        # construct feed dict
                        _, train_steps, tmp_loss = sess.run(
                            [train_op, global_step, (loss if per_rank_output else loss_avg)],
                            feed_dict={handle: trn_handle},
                            options=options, run_metadata=run_metadata)
                        update_timeline_in_range(enable_tf_timeline, run_metadata,
                                                 many_runs_timeline, train_steps,
                                                 "val_" + str(global_step) + ".json",
                                                 min_timeline_step, max_timeline_step)
                        if comm_rank == 0:
                            step_trace_fp = open("train_step_trace_" + str(global_step) + ".pickle", "wb")
                            pickle.dump(run_metadata, step_trace_fp)
                        train_steps_in_epoch = train_steps % num_steps_per_epoch
                        train_loss += tmp_loss
                        nvtx.RangePop()  # Step
                        step += 1

                        # print step report
                        eff_steps = train_steps_in_epoch if (train_steps_in_epoch > 0) else num_steps_per_epoch
                        if (train_steps % loss_print_interval) == 0:
                            if per_rank_output:
                                print("REPORT: rank {}, training loss for step {} (of {}) is {}, time {}"
                                      .format(comm_rank, train_steps, num_steps,
                                              train_loss / eff_steps, time.time() - start_time))
                            else:
                                if comm_rank == 0:
                                    print("REPORT: training loss for step {} (of {}) is {}, time {}"
                                          .format(train_steps, num_steps,
                                                  train_loss / eff_steps, time.time() - start_time))

                        # do the validation phase
                        if train_steps_in_epoch == 0:
                            end_time = time.time()
                            # print epoch report
                            train_loss /= num_steps_per_epoch
                            if per_rank_output:
                                print("COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {} s"
                                      .format(comm_rank, epoch, num_epochs, train_loss,
                                              time.time() - start_time))
                            else:
                                if comm_rank == 0:
                                    print("COMPLETED: training loss for epoch {} (of {}) is {}, time {} s"
                                          .format(epoch, num_epochs, train_loss,
                                                  time.time() - start_time))

                            # evaluation loop
                            eval_loss = 0.
                            eval_steps = 0
                            nvtx.RangePush("Eval Loop", 7)
                            timeline_help_count = 0
                            while True:
                                try:
                                    # construct feed dict
                                    _, tmp_loss, val_model_predictions, val_model_labels = sess.run(
                                        [iou_update_op,
                                         (loss if per_rank_output else loss_avg),
                                         prediction, next_elem[1]],
                                        feed_dict={handle: val_handle},
                                        options=options, run_metadata=run_metadata)
                                    update_timeline_in_range(enable_tf_timeline, run_metadata,
                                                             many_runs_timeline, timeline_help_count,
                                                             "train_" + str(global_step) + ".json",
                                                             min_timeline_step, max_timeline_step)
                                    if comm_rank == 0:
                                        step_trace_fp = open("validation_step_trace_" + str(global_step) + ".pickle", "wb")
                                        pickle.dump(run_metadata, step_trace_fp)
                                    timeline_help_count += 1

                                    # print some images
                                    if comm_rank == 0:
                                        if have_imsave:
                                            imsave(image_dir + '/test_pred_epoch' + str(epoch)
                                                   + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png',
                                                   np.argmax(val_model_predictions[0, ...], axis=2) * 100)
                                            imsave(image_dir + '/test_label_epoch' + str(epoch)
                                                   + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png',
                                                   val_model_labels[0, ...] * 100)
                                            imsave(image_dir + '/test_combined_epoch' + str(epoch)
                                                   + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.png',
                                                   colormap[val_model_labels[0, ...],
                                                            np.argmax(val_model_predictions[0, ...], axis=2)])
                                        else:
                                            np.save(image_dir + '/test_pred_epoch' + str(epoch)
                                                    + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.npy',
                                                    np.argmax(val_model_predictions[0, ...], axis=2) * 100)
                                            np.save(image_dir + '/test_label_epoch' + str(epoch)
                                                    + '_estep' + str(eval_steps) + '_rank' + str(comm_rank) + '.npy',
                                                    val_model_labels[0, ...] * 100)

                                    eval_loss += tmp_loss
                                    eval_steps += 1

                                except tf.errors.OutOfRangeError:
                                    eval_steps = np.max([eval_steps, 1])
                                    eval_loss /= eval_steps
                                    if per_rank_output:
                                        print("COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}"
                                              .format(comm_rank, epoch, num_epochs, eval_loss))
                                    else:
                                        if comm_rank == 0:
                                            print("COMPLETED: evaluation loss for epoch {} (of {}) is {}"
                                                  .format(epoch, num_epochs, eval_loss))
                                    if per_rank_output:
                                        iou_score = sess.run(iou_op)
                                        print("COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}"
                                              .format(comm_rank, epoch, num_epochs, iou_score))
                                    else:
                                        iou_score = sess.run(iou_avg)
                                        if comm_rank == 0:
                                            print("COMPLETED: evaluation IoU for epoch {} (of {}) is {}"
                                                  .format(epoch, num_epochs, iou_score))
                                    sess.run(iou_reset_op)
                                    sess.run(val_init_op, feed_dict={handle: val_handle},
                                             options=options, run_metadata=run_metadata)
                                    update_timeline_in_range(enable_tf_timeline, run_metadata,
                                                             many_runs_timeline,
                                                             "train_" + str(global_step) + ".json")
                                    if comm_rank == 0:
                                        step_trace_fp = open("validation_step_trace_out.pickle", "wb")
                                        pickle.dump(run_metadata, step_trace_fp)
                                    break
                            nvtx.RangePop()  # Eval Loop

                            if enable_tf_timeline:
                                many_runs_timeline.save('Timeliner_output.json')

                            # reset counters
                            epoch += 1
                            train_loss = 0.
                            step = 0

                            nvtx.RangePop()  # Epoch
                            nvtx.RangePush("Epoch", epoch)

                    training_iteration_time_logger.end_timer()

                except tf.errors.OutOfRangeError:
                    break

            nvtx.RangePop()  # Epoch
            nvtx.RangePop()  # Training Loop
            training_loop_timer_logger.end_timer()

    if enable_tf_timeline:
        many_runs_timeline.save('Timeliner_output.json')

    io_training_time_logger.end_timer()
    global_time_logger.end_timer()
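The timeline helpers used above (`init_timeline_configs`, `update_timeline_in_range`) are project-specific and not shown in this excerpt. A minimal stand-in built on TF1's chrome-trace support might look like this (the function name is an assumption):

from tensorflow.python.client import timeline

def write_chrome_trace(run_metadata, path):
    # Convert the step stats collected with tf.RunOptions.FULL_TRACE into a
    # chrome://tracing compatible JSON file.
    tl = timeline.Timeline(run_metadata.step_stats)
    with open(path, 'w') as f:
        f.write(tl.generate_chrome_trace_format())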
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.freeze(False)  # populate new keys now
    _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1  # +1 background
    _C.DATA.BASEDIR = os.path.expanduser(_C.DATA.BASEDIR)
    if isinstance(_C.DATA.VAL, six.string_types):  # support single string (the typical case) as well
        _C.DATA.VAL = (_C.DATA.VAL, )

    assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN', 'None'], _C.BACKBONE.NORM
    if _C.BACKBONE.NORM != 'FreezeBN':
        assert not _C.BACKBONE.FREEZE_AFFINE
    assert _C.BACKBONE.FREEZE_AT in [0, 1, 2]

    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[3]  # [3] because we build FPN with features r2,r3,r4,r5

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(_C.PREPROC.MAX_SIZE / size_mult) * size_mult
        assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint']
        assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.NORM in ['None', 'GN']

        if _C.FPN.CASCADE:
            # the first threshold is the proposal sampling threshold
            assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH
            assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == len(_C.CASCADE.IOUS)

    if is_training:
        train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE
        if isinstance(train_scales, (list, tuple)) and train_scales[1] - train_scales[0] > 100:
            # don't autotune if augmentation is on
            os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()
            if ngpu == hvd.local_size():
                logger.warn("It's not recommended to use horovod for single-machine training. "
                            "Replicated trainer is more stable and has the same efficiency.")
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu > 0, "Has to train with GPU!"
        assert ngpu % 8 == 0 or 8 % ngpu == 0, \
            "Can only train with 1,2,4 or >=8 GPUs, but found {} GPUs".format(ngpu)
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        ngpu = get_num_gpu()

    if _C.TRAIN.NUM_GPUS is None:
        _C.TRAIN.NUM_GPUS = ngpu
    else:
        if _C.TRAINER == 'horovod':
            assert _C.TRAIN.NUM_GPUS == ngpu
        else:
            assert _C.TRAIN.NUM_GPUS <= ngpu

    _C.freeze()
    logger.info("Config: ------------------------------------------\n" + str(_C))
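As a worked example of the `MAX_SIZE` rounding above: assuming `ANCHOR_STRIDES[3] == 32` (a common FPN setting), a `PREPROC.MAX_SIZE` of 1333 is rounded up to the next multiple of 32.

import numpy as np

size_mult = 32.0  # _C.FPN.RESOLUTION_REQUIREMENT, assumed value
print(np.ceil(1333 / size_mult) * size_mult)  # -> 1344.0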
args, unknown = parser.parse_known_args()
print(args)

size, rank, local_size, local_rank = None, None, None, None

if args.nccl:
    import horovod.tensorflow as dist
else:
    import smdistributed.dataparallel.tensorflow as dist
    import smddpcommon as hm
    hm.setBucketSize(args.bucket_size * 1024 * 1024)

dist.init()
size = dist.size()
rank = dist.rank()
local_size = dist.local_size()
local_rank = dist.local_rank()

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], 'GPU')

if args.fp32:
    DTYPE, DTSIZE = tf.dtypes.float32, 4
else:
    DTYPE, DTSIZE = tf.dtypes.float16, 2

@tf.function
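The function decorated by the trailing `@tf.function` is truncated in this excerpt. Both `horovod.tensorflow` and `smdistributed.dataparallel.tensorflow` expose a `DistributedGradientTape`, so a training step behind the common `dist` alias might be sketched as follows; `model`, `opt`, and `loss_fn` are assumptions, not names from the original.

@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x, training=True))
    # Average gradients across ranks through the shared dist API.
    tape = dist.DistributedGradientTape(tape)
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    return loss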