Esempio n. 1
0
    def test_no_bootstrap_script_if_not_needed(self):
        # with bootstrap_mrjob and bootstrap_python both switched off,
        # no master bootstrap script path should be set
        runner_opts = dict(conf_paths=[],
                           bootstrap_mrjob=False,
                           bootstrap_python=False)
        runner = DataprocJobRunner(**runner_opts)

        runner._add_bootstrap_files_for_upload()

        script_path = runner._master_bootstrap_script_path
        self.assertIsNone(script_path)
Esempio n. 2
0
    def test_bootstrap_mrjob_uses_python_bin(self):
        # bootstrap mrjob with a custom python_bin; the generated script
        # is expected to run compileall through that binary
        runner = DataprocJobRunner(
            conf_paths=[],
            bootstrap_mrjob=True,
            python_bin=["anaconda"],
        )
        runner._add_bootstrap_files_for_upload()

        script_path = runner._master_bootstrap_script_path
        self.assertIsNotNone(script_path)

        with open(script_path, "r") as script:
            script_text = script.read()

        self.assertIn("sudo anaconda -m compileall -q -f", script_text)
Esempio n. 3
0
    def test_bootstrap_script_respects_sh_bin(self):
        runner = DataprocJobRunner(conf_paths=[])

        # replace _sh_bin() so that it reports /bin/bash
        sh_bin_patch = patch('mrjob.dataproc.DataprocJobRunner._sh_bin',
                             return_value=['/bin/bash'])
        self.start(sh_bin_patch)

        runner._add_bootstrap_files_for_upload()

        script_path = runner._master_bootstrap_script_path
        self.assertIsNotNone(script_path)

        with open(script_path) as script:
            script_lines = script.readlines()

        # the first line of the script should be a shebang for that shell
        shebang = script_lines[0].strip()
        self.assertEqual(shebang, '#!/bin/bash')
Esempio n. 4
0
    def test_bootstrap_script_respects_sh_bin(self):
        runner = DataprocJobRunner(conf_paths=[])

        # patch _sh_bin() to return /bin/bash before the script is built
        patcher = patch('mrjob.dataproc.DataprocJobRunner._sh_bin',
                        return_value=['/bin/bash'])
        self.start(patcher)

        runner._add_bootstrap_files_for_upload()

        bootstrap_script = runner._master_bootstrap_script_path
        self.assertIsNotNone(bootstrap_script)

        with open(bootstrap_script) as script_file:
            contents = script_file.readlines()

        # the script's shebang should point at the patched shell
        first_line = contents[0]
        self.assertEqual(first_line.strip(), '#!/bin/bash')
Esempio n. 5
0
    def test_bootstrap_mrjob_uses_python_bin(self):
        # ask for mrjob to be bootstrapped using a non-default python_bin
        runner_opts = dict(conf_paths=[],
                           bootstrap_mrjob=True,
                           python_bin=['anaconda'])
        runner = DataprocJobRunner(**runner_opts)

        runner._add_bootstrap_files_for_upload()

        bootstrap_script = runner._master_bootstrap_script_path
        self.assertIsNotNone(bootstrap_script)

        with open(bootstrap_script, 'r') as script_file:
            script_body = script_file.read()

        # compileall should be invoked with the custom python binary
        expected_cmd = 'sudo anaconda -m compileall -q -f'
        self.assertIn(expected_cmd, script_body)
Esempio n. 6
0
    def test_bootstrap_script_respects_sh_pre_commands(self):
        runner = DataprocJobRunner(conf_paths=[])

        # make _sh_pre_commands() report two recognizable dummy commands
        pre_commands_patch = patch(
            'mrjob.dataproc.DataprocJobRunner._sh_pre_commands',
            return_value=['garply', 'quux'])
        self.start(pre_commands_patch)

        runner._add_bootstrap_files_for_upload()

        script_path = runner._master_bootstrap_script_path
        self.assertIsNotNone(script_path)

        with open(script_path) as script:
            script_lines = script.readlines()

        # the pre-commands should be the second and third lines of the script
        second_and_third = list(map(str.strip, script_lines[1:3]))
        self.assertEqual(second_and_third, ['garply', 'quux'])
Esempio n. 7
0
    def test_usr_bin_env(self):
        # sh_bin is given without an absolute path, so the shebang is
        # expected to go through /usr/bin/env
        runner = DataprocJobRunner(
            conf_paths=[],
            bootstrap_mrjob=True,
            sh_bin="bash -e",
        )
        runner._add_bootstrap_files_for_upload()

        script_path = runner._master_bootstrap_script_path
        self.assertIsNotNone(script_path)
        self.assertTrue(os.path.exists(script_path))

        with open(script_path) as script:
            script_lines = list(map(str.rstrip, script))

        shebang = script_lines[0]
        self.assertEqual(shebang, "#!/usr/bin/env bash -e")
Esempio n. 8
0
    def test_bootstrap_script_respects_sh_pre_commands(self):
        runner = DataprocJobRunner(conf_paths=[])

        # patch _sh_pre_commands() to return two placeholder commands
        patcher = patch('mrjob.dataproc.DataprocJobRunner._sh_pre_commands',
                        return_value=['garply', 'quux'])
        self.start(patcher)

        runner._add_bootstrap_files_for_upload()

        bootstrap_script = runner._master_bootstrap_script_path
        self.assertIsNotNone(bootstrap_script)

        with open(bootstrap_script) as script_file:
            contents = script_file.readlines()

        # lines 2 and 3 of the script should be exactly those commands
        stripped = []
        for raw_line in contents[1:3]:
            stripped.append(raw_line.strip())

        self.assertEqual(stripped, ['garply', 'quux'])
Esempio n. 9
0
    def test_usr_bin_env(self):
        # a relative sh_bin should produce a "#!/usr/bin/env ..." shebang
        runner_opts = dict(conf_paths=[],
                           bootstrap_mrjob=True,
                           sh_bin='bash -e')
        runner = DataprocJobRunner(**runner_opts)

        runner._add_bootstrap_files_for_upload()

        bootstrap_script = runner._master_bootstrap_script_path
        self.assertIsNotNone(bootstrap_script)
        self.assertTrue(os.path.exists(bootstrap_script))

        with open(bootstrap_script) as script_file:
            contents = [raw_line.rstrip() for raw_line in script_file]

        first_line = contents[0]
        self.assertEqual(first_line, '#!/usr/bin/env bash -e')
Esempio n. 10
0
    def test_create_master_bootstrap_script(self):
        # an empty local file for the first bootstrap command to reference
        foo_py_path = os.path.join(self.tmp_dir, 'foo.py')
        with open(foo_py_path, 'w'):
            pass

        bootstrap_cmds = [
            PYTHON_BIN + ' ' + foo_py_path + '#bar.py',
            'gs://walrus/scripts/ohnoes.sh#',
            'echo "Hi!"',
            'true',
            'ls',
            'speedups.sh',
            '/tmp/s.sh',
        ]
        runner = DataprocJobRunner(conf_paths=[],
                                   bootstrap=bootstrap_cmds,
                                   bootstrap_mrjob=True)

        runner._add_bootstrap_files_for_upload()

        script_path = runner._master_bootstrap_script_path
        self.assertIsNotNone(script_path)
        self.assertTrue(os.path.exists(script_path))

        with open(script_path) as script:
            lines = [raw_line.rstrip() for raw_line in script]

        self.assertEqual(lines[0], '#!/bin/sh -ex')

        # check PWD gets stored
        self.assertIn('__mrjob_PWD=$PWD', lines)

        def assert_script_downloads(path, name=None):
            # the script should copy *path* into $__mrjob_PWD and then
            # chmod the local copy
            uri = runner._upload_mgr.uri(path)
            local_name = runner._bootstrap_dir_mgr.name(
                'file', path, name=name)
            local_path = '$__mrjob_PWD/%s' % (local_name,)

            self.assertIn(
                '  hadoop fs -copyToLocal %s %s' % (uri, local_path), lines)
            self.assertIn('  chmod u+rx %s' % (local_path,), lines)

        # check files get downloaded
        assert_script_downloads(foo_py_path, 'bar.py')
        assert_script_downloads('gs://walrus/scripts/ohnoes.sh')
        assert_script_downloads(runner._mrjob_zip_path)

        # check scripts get run

        # bootstrap
        expected_bootstrap_lines = [
            '  ' + PYTHON_BIN + ' $__mrjob_PWD/bar.py',
            '  $__mrjob_PWD/ohnoes.sh',
            '  echo "Hi!"',
            '  true',
            '  ls',
            '  speedups.sh',
            '  /tmp/s.sh',
        ]
        for expected in expected_bootstrap_lines:
            self.assertIn(expected, lines)

        # bootstrap_mrjob
        mrjob_zip_name = runner._bootstrap_dir_mgr.name(
            'file', runner._mrjob_zip_path)
        expected_mrjob_lines = [
            "  __mrjob_PYTHON_LIB=$(" + PYTHON_BIN + " -c 'from"
            " distutils.sysconfig import get_python_lib;"
            " print(get_python_lib())')",
            '  sudo unzip $__mrjob_PWD/' + mrjob_zip_name +
            ' -d $__mrjob_PYTHON_LIB',
            '  sudo ' + PYTHON_BIN + ' -m compileall -q -f'
            ' $__mrjob_PYTHON_LIB/mrjob && true',
        ]
        for expected in expected_mrjob_lines:
            self.assertIn(expected, lines)

        # bootstrap_python
        if PY2:
            python_install = (
                '  sudo apt-get install -y python-pip python-dev')
        else:
            python_install = (
                '  sudo apt-get install -y python3 python3-pip python3-dev')
        self.assertIn(python_install, lines)
Esempio n. 11
0
    def test_no_bootstrap_script_if_not_needed(self):
        # neither mrjob nor python is bootstrapped here, so the runner
        # should not set a master bootstrap script path
        runner = DataprocJobRunner(
            conf_paths=[],
            bootstrap_mrjob=False,
            bootstrap_python=False)

        runner._add_bootstrap_files_for_upload()

        bootstrap_script = runner._master_bootstrap_script_path
        self.assertIsNone(bootstrap_script)
Esempio n. 12
0
    def test_create_master_bootstrap_script(self):
        # an empty local file for the first bootstrap command to reference
        foo_py_path = os.path.join(self.tmp_dir, 'foo.py')
        with open(foo_py_path, 'w'):
            pass

        # use all the bootstrap options
        bootstrap_opt = [
            PYTHON_BIN + ' ' + foo_py_path + '#bar.py',
            'gs://walrus/scripts/ohnoes.sh#',
            # bootstrap_cmds
            'echo "Hi!"',
            'true',
            'ls',
            # bootstrap_scripts
            'speedups.sh',
            '/tmp/s.sh',
        ]
        runner = DataprocJobRunner(
            conf_paths=[], bootstrap=bootstrap_opt, bootstrap_mrjob=True)

        runner._add_bootstrap_files_for_upload()

        script_path = runner._master_bootstrap_script_path
        self.assertIsNotNone(script_path)
        self.assertTrue(os.path.exists(script_path))

        with open(script_path) as script:
            lines = [raw_line.rstrip() for raw_line in script]

        self.assertEqual(lines[0], '#!/bin/sh -ex')

        # check PWD gets stored
        self.assertIn('__mrjob_PWD=$PWD', lines)

        def assert_script_downloads(path, name=None):
            # the script should copy *path* into $__mrjob_PWD and then
            # chmod the local copy
            uri = runner._upload_mgr.uri(path)
            local_name = runner._bootstrap_dir_mgr.name(
                'file', path, name=name)
            local_path = '$__mrjob_PWD/%s' % (local_name,)

            expected_download_lines = (
                'hadoop fs -copyToLocal %s %s' % (uri, local_path),
                'chmod a+x %s' % (local_path,),
            )
            for expected in expected_download_lines:
                self.assertIn(expected, lines)

        # check files get downloaded
        assert_script_downloads(foo_py_path, 'bar.py')
        assert_script_downloads('gs://walrus/scripts/ohnoes.sh')
        assert_script_downloads(runner._mrjob_tar_gz_path)

        # check scripts get run

        # bootstrap
        expected_bootstrap_lines = (
            PYTHON_BIN + ' $__mrjob_PWD/bar.py',
            '$__mrjob_PWD/ohnoes.sh',
            'echo "Hi!"',
            'true',
            'ls',
            'speedups.sh',
            '/tmp/s.sh',
        )
        for expected in expected_bootstrap_lines:
            self.assertIn(expected, lines)

        # bootstrap_mrjob
        mrjob_tar_gz_name = runner._bootstrap_dir_mgr.name(
            'file', runner._mrjob_tar_gz_path)
        expected_mrjob_lines = (
            "__mrjob_PYTHON_LIB=$(" + PYTHON_BIN + " -c 'from"
            " distutils.sysconfig import get_python_lib;"
            " print(get_python_lib())')",
            'sudo tar xfz $__mrjob_PWD/' + mrjob_tar_gz_name +
            ' -C $__mrjob_PYTHON_LIB',
            'sudo ' + PYTHON_BIN + ' -m compileall -f'
            ' $__mrjob_PYTHON_LIB/mrjob && true',
        )
        for expected in expected_mrjob_lines:
            self.assertIn(expected, lines)

        # bootstrap_python
        if PY2:
            python_install = 'sudo apt-get install -y python-pip python-dev'
        else:
            python_install = (
                'sudo apt-get install -y python3 python3-pip python3-dev')
        self.assertIn(python_install, lines)
Esempio n. 13
0
    def test_create_master_bootstrap_script(self):
        # an empty local file for the first bootstrap command to reference
        foo_py_path = os.path.join(self.tmp_dir, "foo.py")
        with open(foo_py_path, "w"):
            pass

        # use all the bootstrap options
        bootstrap_opt = [
            PYTHON_BIN + " " + foo_py_path + "#bar.py",
            "gs://walrus/scripts/ohnoes.sh#",
            # bootstrap_cmds
            'echo "Hi!"',
            "true",
            "ls",
            # bootstrap_scripts
            "speedups.sh",
            "/tmp/s.sh",
        ]
        runner = DataprocJobRunner(
            conf_paths=[], bootstrap=bootstrap_opt, bootstrap_mrjob=True
        )

        runner._add_bootstrap_files_for_upload()

        script_path = runner._master_bootstrap_script_path
        self.assertIsNotNone(script_path)
        self.assertTrue(os.path.exists(script_path))

        with open(script_path) as script:
            lines = [raw_line.rstrip() for raw_line in script]

        self.assertEqual(lines[0], "#!/bin/sh -ex")

        # check PWD gets stored
        self.assertIn("__mrjob_PWD=$PWD", lines)

        def assert_script_downloads(path, name=None):
            # the script should copy *path* into $__mrjob_PWD and then
            # chmod the local copy
            uri = runner._upload_mgr.uri(path)
            local_name = runner._bootstrap_dir_mgr.name(
                "file", path, name=name
            )
            local_path = "$__mrjob_PWD/%s" % (local_name,)

            self.assertIn(
                "hadoop fs -copyToLocal %s %s" % (uri, local_path), lines
            )
            self.assertIn("chmod a+x %s" % (local_path,), lines)

        # check files get downloaded
        assert_script_downloads(foo_py_path, "bar.py")
        assert_script_downloads("gs://walrus/scripts/ohnoes.sh")
        assert_script_downloads(runner._mrjob_zip_path)

        # check scripts get run

        # bootstrap
        expected_bootstrap_lines = [
            PYTHON_BIN + " $__mrjob_PWD/bar.py",
            "$__mrjob_PWD/ohnoes.sh",
            'echo "Hi!"',
            "true",
            "ls",
            "speedups.sh",
            "/tmp/s.sh",
        ]
        for expected in expected_bootstrap_lines:
            self.assertIn(expected, lines)

        # bootstrap_mrjob
        mrjob_zip_name = runner._bootstrap_dir_mgr.name(
            "file", runner._mrjob_zip_path
        )
        expected_mrjob_lines = [
            "__mrjob_PYTHON_LIB=$(" + PYTHON_BIN + " -c 'from"
            " distutils.sysconfig import get_python_lib;"
            " print(get_python_lib())')",
            "sudo unzip $__mrjob_PWD/"
            + mrjob_zip_name
            + " -d $__mrjob_PYTHON_LIB",
            "sudo "
            + PYTHON_BIN
            + " -m compileall -q -f $__mrjob_PYTHON_LIB/mrjob && true",
        ]
        for expected in expected_mrjob_lines:
            self.assertIn(expected, lines)

        # bootstrap_python
        if PY2:
            python_install = "sudo apt-get install -y python-pip python-dev"
        else:
            python_install = (
                "sudo apt-get install -y python3 python3-pip python3-dev"
            )
        self.assertIn(python_install, lines)