Exemple #1
0
def PrepareHorovod(vm):
    """Provision one VM with Horovod and the Python packages used with it.

    System packages are installed first. The pip executable is then chosen
    from FLAGS.cloud and reused for every pip command that follows. The last
    step hands FLAGS.horovod_model and the vm to _CopyAndUpdateRunScripts.

    Args:
      vm: vm to operate on
    """
    logging.info('Installing Horovod on %s', vm)
    vm.AuthenticateVm()

    for package in ('google_cloud_sdk', 'openmpi'):
        vm.Install(package)
    vm.InstallPackages('wget git unzip')
    vm.Install('nccl')

    # Choose the pip executable; any cloud other than GCP/AWS uses plain pip.
    cloud = FLAGS.cloud
    if cloud == 'GCP':
        pip_bin = '/opt/conda/bin/pip'
        vm.RemoteCommand(f'sudo {pip_bin} install --force-reinstall pyarrow')
    elif cloud == 'AWS':
        vm.RobustRemoteCommand('. anaconda3/bin/activate tensorflow_p37')
        pip_bin = 'anaconda3/envs/tensorflow_p37/bin/pip'
    else:
        pip_bin = 'pip'

    # The toolkit version with its dot removed (e.g. 11.0 -> 110) is appended
    # to the DALI package names below.
    toolkit_version = cuda_toolkit.GetCudaToolkitVersion(vm)
    cuda_suffix = toolkit_version.replace('.', '')

    pip_install = f'sudo {pip_bin} install'
    nvidia_index = (
        '--extra-index-url '
        'https://developer.download.nvidia.com/compute/redist/')
    horovod_build_env = ' '.join((
        'HOROVOD_GPU_OPERATIONS=NCCL',
        'HOROVOD_WITH_TENSORFLOW=1',
        'HOROVOD_WITH_MPI=1',
    ))

    # Commands are issued strictly in this order, one RemoteCommand each.
    setup_commands = (
        f'{pip_install} {nvidia_index} '
        'git+https://github.com/NVIDIA/dllogger.git '
        f'nvidia-dali-cuda{cuda_suffix}',
        f'{pip_install} {nvidia_index} '
        f'nvidia-dali-tf-plugin-cuda{cuda_suffix}',
        # Drop whatever horovod is present before installing it again with
        # the HOROVOD_* variables set.
        f'sudo {pip_bin} uninstall -y horovod',
        f'sudo {horovod_build_env} {pip_bin} install -U --no-cache horovod',
        f"{pip_install} pynvml cython scipy 'opencv-python==3.4.2.17'",
        f"{pip_install} 'git+https://github.com/cocodataset/cocoapi.git"
        "#subdirectory=PythonAPI'",
        # The shell evaluates this left to right: clone only if the
        # directory is missing, then install if that step succeeded.
        '[ -d "tensorpack" ] || '
        'git clone https://github.com/tensorpack/tensorpack.git && '
        f'{pip_install} ./tensorpack',
    )
    for command in setup_commands:
        vm.RemoteCommand(command)

    _CopyAndUpdateRunScripts(FLAGS.horovod_model, vm)
Exemple #2
0
def _PrepareHorovod(vm):
    """Install the Horovod benchmark dependencies on one VM.

    After the system packages, the pip executable is selected from
    FLAGS.cloud and used for the DALI, OpenCV/SciPy, cocoapi and tensorpack
    installs.

    Args:
      vm: vm to operate on
    """
    logging.info('Installing Horovod on %s', vm)
    vm.AuthenticateVm()

    vm.Install('google_cloud_sdk')
    vm.InstallPackages('wget git unzip')
    vm.Install('nccl')

    target_cloud = FLAGS.cloud
    if target_cloud == 'GCP':  # temporary fix for DLVM images
        pip_path = '/opt/conda/bin/pip'
        vm.RemoteCommand(f'sudo {pip_path} install --force-reinstall pyarrow')
        vm.Install('openmpi')
    elif target_cloud == 'AWS':
        vm.RobustRemoteCommand('. anaconda3/bin/activate tensorflow_p36')
        pip_path = 'anaconda3/envs/tensorflow_p36/bin/pip'
    else:
        # Every other cloud uses whatever `pip` resolves to on the VM.
        pip_path = 'pip'

    # Dots are stripped from the toolkit version (e.g. 11.0 -> 110) to form
    # the suffix of the DALI package names.
    version_digits = cuda_toolkit.GetCudaToolkitVersion(vm).replace('.', '')

    install_prefix = f'sudo {pip_path} install'
    extra_index = (
        '--extra-index-url '
        'https://developer.download.nvidia.com/compute/redist/')

    dali_packages = (
        'git+https://github.com/NVIDIA/dllogger.git '
        f'nvidia-dali-cuda{version_digits}',
        f'nvidia-dali-tf-plugin-cuda{version_digits}',
    )
    for package_spec in dali_packages:
        vm.RemoteCommand(f'{install_prefix} {extra_index} {package_spec}')

    python_packages = (
        "cython scipy 'opencv-python==3.4.2.17'",
        "'git+https://github.com/cocodataset/cocoapi.git"
        "#subdirectory=PythonAPI'",
    )
    for package_spec in python_packages:
        vm.RemoteCommand(f'{install_prefix} {package_spec}')

    # The shell evaluates this left to right: clone only if the directory is
    # missing, then install if that step succeeded.
    tensorpack_repo = 'https://github.com/tensorpack/tensorpack.git'
    vm.RemoteCommand(
        f'[ -d "tensorpack" ] || git clone {tensorpack_repo} && '
        f'{install_prefix} ./tensorpack')