def train_and_evaluate(data_path, total_epochs, gpu_count=1):
    # Assumes the CNTK distributed/device modules and the cifar_resnet_distributed
    # training function are imported by the surrounding script.
    #
    # Create a distributed communicator for 1-bit SGD, which scales better to
    # multiple GPUs. To avoid quantization loss, use the plain (non-quantized)
    # MPI communicator instead.
    quantization_bit = 1
    if quantization_bit == 32:
        communicator = distributed.mpi_communicator()
    else:
        communicator = distributed.quantized_mpi_communicator(quantization_bit)

    workers = communicator.workers()
    current_worker = communicator.current_worker()
    print("List all distributed workers")
    for wk in workers:
        if current_worker.global_rank == wk.global_rank:
            print("* {} {}".format(wk.global_rank, wk.host_id))
        else:
            print("  {} {}".format(wk.global_rank, wk.host_id))

    if gpu_count == 1 and len(workers) > 1:
        print("Warning: running distributed training on 1-GPU will be slow")
        device.set_default_device(gpu(0))

    print("Training on device type:{} id:{}".format(
        'gpu' if device.default().type() else 'cpu',
        device.default().id()))

    start_model = "start_model.bin"
    num_start_epochs = 1
    num_parallel_epochs = total_epochs - num_start_epochs

    # Train the start model on one worker only.
    if communicator.current_worker().global_rank == 0:
        cifar_resnet_distributed(data_path, save_model_filename=start_model,
                                 communicator=None, run_test=False,
                                 num_epochs=num_start_epochs)

    communicator.barrier()

    # Continue training in parallel on all workers from the saved start model.
    error = cifar_resnet_distributed(data_path, load_model_filename=start_model,
                                     communicator=communicator, run_test=True,
                                     num_epochs=num_parallel_epochs)

    distributed.Communicator.finalize()
    return error
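# Illustrative entry point (a sketch, not part of the original function): each MPI
# rank runs this script and becomes one worker, e.g. launched with an MPI runner
# such as `mpiexec -n 2 python <script>.py`. The data path and epoch counts below
# are assumptions made only for this example.
if __name__ == '__main__':
    error = train_and_evaluate(data_path="CIFAR-10", total_epochs=5, gpu_count=1)
    print("Error: %f" % error)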
import numpy as np


def test_get_data_type():
    # Assumes the CNTK helpers (get_data_type, constant, input_variable,
    # create_NDArrayView_from_NumPy, create_Value_from_NumPy, default) are
    # imported by the surrounding test module.
    assert get_data_type(constant(value=2), constant(value=1)) == np.float32
    assert get_data_type(input_variable(shape=(2, 3)),
                         constant(value=1)) == np.float32

    ndav32 = create_NDArrayView_from_NumPy(
        np.asarray([[1, 2]], dtype=np.float32))
    assert get_data_type(input_variable(shape=(2, 3), data_type=np.float64),
                         ndav32) == np.float64

    ndav64 = create_NDArrayView_from_NumPy(
        np.asarray([[1, 2]], dtype=np.float64))
    assert get_data_type(input_variable(shape=(2, 3), data_type=np.float64),
                         ndav64) == np.float64

    val32 = create_Value_from_NumPy(
        np.asarray([[1, 2]], dtype=np.float32), dev=default())
    assert get_data_type(val32, ndav64) == np.float64
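# For intuition only: the promotion rule exercised above mirrors NumPy's own type
# promotion, where mixing float32 and float64 operands yields float64. This is a
# CNTK-free sketch; the names below are local to this example.
import numpy as np

a32 = np.asarray([[1, 2]], dtype=np.float32)
b64 = np.asarray([[1, 2]], dtype=np.float64)
assert np.result_type(a32, b64) == np.float64   # mixed precision promotes to float64
assert (a32 + b64).dtype == np.float64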
*"../../../../Examples/Image/DataSets/CIFAR-10/".split("/")))) os.chdir(data_path) # Create distributed communicator for 1-bit SGD communicator = distributed.communicator(distributed.quantized_mpi_communicator(1)) workers = communicator.workers() current_worker = communicator.current_worker() print("List all distributed workers") for wk in workers: if current_worker.global_rank == wk.global_rank: print("* {} {}".format(wk.global_rank, wk.host_id)) else: print(" {} {}".format(wk.global_rank, wk.host_id)) print("Training on device type:{} id:{}".format('gpu' if device.default().type() else 'cpu', device.default().id())) start_model = "start_model.bin" num_start_epochs = 1 num_parallel_epochs = 10 # training the start model only in one worker if communicator.current_worker().global_rank == 0: cifar_resnet(data_path, save_model_filename=start_model, communicator=None, run_test=False, num_epochs=num_start_epochs) communicator.barrier() # train in parallel error = cifar_resnet(data_path, load_model_filename=start_model, communicator=communicator, run_test=True, num_epochs=num_parallel_epochs) print("Error: %f" % error)