def main(epochs, buffer_size, batch_size, train_mode, distribution_strategy, num_gpus, workers, w_type, w_index):
    """Train a 'resnet56' benchmark model under the requested distribution strategy.

    Args:
        epochs: Number of training epochs passed to Benchmark.
        buffer_size: Currently unused by this function (kept for interface
            compatibility — dataset size is taken from data_obj.get_buffer_size()).
        batch_size: Per-batch size used both for the Dataset and for the
            steps-per-epoch computation.
        train_mode: Forwarded to Benchmark.run().
        distribution_strategy: Strategy name forwarded to
            get_distribution_strategy().
        num_gpus: Number of GPUs forwarded to get_distribution_strategy().
        workers: Worker spec forwarded to get_distribution_strategy().
        w_type: Worker type forwarded as `typ`.
        w_index: Worker index forwarded as `index`.
    """
    strategy = get_distribution_strategy(strategy=distribution_strategy,
                                         num_gpus=num_gpus,
                                         workers=workers,
                                         typ=w_type,
                                         index=w_index)
    print_msg('Number of devices: {}'.format(strategy.num_replicas_in_sync), 'info')

    # BUG FIX: was Dataset(batch_size=128), which silently ignored the
    # batch_size argument and made steps_per_epoch (computed from the
    # parameter below) inconsistent with the actual batching.
    data_obj = Dataset(batch_size=batch_size)
    train_dataset, test_dataset = data_obj.create_dataset()
    steps_per_epoch = data_obj.get_buffer_size() // batch_size

    train_obj = Benchmark(epochs, steps_per_epoch, 'resnet56')
    # Model must be created and compiled inside the strategy scope so its
    # variables are placed/mirrored by the distribution strategy.
    with strategy.scope():
        train_obj.create_model('resnet56')
        train_obj.compile_model()

    print_msg('Training...', 'info')
    train_obj.run(train_dataset, test_dataset, train_mode)
    print_msg('Training Done.', 'succ')
def main(epochs, buffer_size, batch_size, train_mode, display_every, distribution_strategy, num_gpus, workers, w_type, w_index, setup_cluster, verbose):
    """Run the 'resnet56' training benchmark under a distribution strategy.

    Configures TF logging verbosity via the TF_CPP_MIN_LOG_LEVEL environment
    variable when `verbose` is truthy, resolves the distribution strategy,
    builds the dataset, and hands everything to Benchmark.run().

    Note: `buffer_size` is accepted for interface compatibility but the
    dataset size used for steps-per-epoch comes from the Dataset object.
    """
    if verbose:
        # NOTE(review): presumably an integer log-level knob for TensorFlow's
        # C++ backend — confirm expected values against the CLI that calls us.
        os.environ["TF_CPP_MIN_LOG_LEVEL"] = str(verbose)

    strategy = get_distribution_strategy(
        strategy=distribution_strategy,
        train_mode=train_mode,
        num_gpus=num_gpus,
        workers=workers,
        typ=w_type,
        index=w_index,
        setup=setup_cluster,
    )
    # With the single-GPU default, report the replica count the strategy
    # actually resolved to instead of the requested value.
    if num_gpus == 1:
        num_gpus = strategy.num_replicas_in_sync
    print_msg('Number of devices: {}'.format(num_gpus), 'info')

    dataset = Dataset(batch_size)
    train_ds, test_ds = dataset.create_dataset()
    epoch_steps = dataset.get_buffer_size() // batch_size

    benchmark = Benchmark(epochs, epoch_steps, batch_size, display_every, num_gpus, 'resnet56', strategy)
    print_msg('Training...', 'info')
    benchmark.run(train_ds, test_ds, train_mode)
    print_msg('Training Done.', 'succ')