Esempio n. 1
0
    def test_cross_region_explicit_tmp_uri(self):
        self._make_bucket('walrus', EU_WEST_GCE_REGION)

        runner = DataprocJobRunner(region=US_EAST_GCE_REGION,
                                   cloud_tmp_dir='gs://walrus/tmp/')

        self.assertEqual(runner._cloud_tmp_dir, 'gs://walrus/tmp/')

        # tmp bucket shouldn't influence region (it did in 0.4.x)
        self.assertEqual(runner._gce_region, US_EAST_GCE_REGION)
Esempio n. 2
0
 def _test_mode(self, mode):
     r = DataprocJobRunner(conf_paths=[])
     with patch.multiple(r,
                         _cleanup_cluster=mock.DEFAULT,
                         _cleanup_job=mock.DEFAULT,
                         _cleanup_local_tmp=mock.DEFAULT,
                         _cleanup_logs=mock.DEFAULT,
                         _cleanup_cloud_tmp=mock.DEFAULT) as mock_dict:
         r.cleanup(mode=mode)
         yield mock_dict
Esempio n. 3
0
    def test_gcs_cat(self):
        self.put_gcs_multi({
            'gs://walrus/one': b'one_text',
            'gs://walrus/two': b'two_text',
            'gs://walrus/three': b'three_text',
        })

        runner = DataprocJobRunner(cloud_tmp_dir='gs://walrus/tmp',
                                   conf_paths=[])

        self.assertEqual(list(runner.fs.cat('gs://walrus/one')), [b'one_text'])
Esempio n. 4
0
    def test_bootstrap_script_respects_sh_bin(self):
        runner = DataprocJobRunner(conf_paths=[])

        self.start(patch('mrjob.dataproc.DataprocJobRunner._sh_bin',
                         return_value=['/bin/bash']))
        runner._add_bootstrap_files_for_upload()
        self.assertIsNotNone(runner._master_bootstrap_script_path)
        with open(runner._master_bootstrap_script_path) as f:
            lines = list(f)

        self.assertEqual(lines[0].strip(), '#!/bin/bash')
Esempio n. 5
0
    def test_bootstrap_mrjob_uses_python_bin(self):
        # use all the bootstrap options
        runner = DataprocJobRunner(conf_paths=[],
                                   bootstrap_mrjob=True,
                                   python_bin=['anaconda'])

        runner._add_bootstrap_files_for_upload()
        self.assertIsNotNone(runner._master_bootstrap_script_path)
        with open(runner._master_bootstrap_script_path, 'r') as f:
            content = f.read()

        self.assertIn('sudo anaconda -m compileall -q -f', content)
Esempio n. 6
0
    def test_bootstrap_script_respects_sh_pre_commands(self):
        runner = DataprocJobRunner(conf_paths=[])

        self.start(patch('mrjob.dataproc.DataprocJobRunner._sh_pre_commands',
                         return_value=['garply', 'quux']))
        runner._add_bootstrap_files_for_upload()
        self.assertIsNotNone(runner._master_bootstrap_script_path)
        with open(runner._master_bootstrap_script_path) as f:
            lines = list(f)

        self.assertEqual([line.strip() for line in lines[1:3]],
                         ['garply', 'quux'])
Esempio n. 7
0
    def test_command_line_beats_config(self):
        ZONE_CONF = dict(runners=dict(dataproc=dict(zone='us-west1-a')))

        with mrjob_conf_patcher(ZONE_CONF):
            runner = DataprocJobRunner(region='europe-west1')

            # region takes precedence because it was set on the command line
            self.assertEqual(runner._opts['region'], 'europe-west1')
            self.assertEqual(runner._opts['zone'], None)
            # only a problem if you set region and zone
            # in the same config
            self.assertFalse(self.log.warning.called)
Esempio n. 8
0
    def test_usr_bin_env(self):
        runner = DataprocJobRunner(conf_paths=[],
                                   bootstrap_mrjob=True,
                                   sh_bin='bash -e')

        runner._add_bootstrap_files_for_upload()

        self.assertIsNotNone(runner._master_bootstrap_script_path)
        self.assertTrue(os.path.exists(runner._master_bootstrap_script_path))

        with open(runner._master_bootstrap_script_path) as f:
            lines = [line.rstrip() for line in f]

        self.assertEqual(lines[0], '#!/usr/bin/env bash -e')
Esempio n. 9
0
    def test_no_bootstrap_script_if_not_needed(self):
        runner = DataprocJobRunner(conf_paths=[], bootstrap_mrjob=False,
                                   bootstrap_python=False)

        runner._add_bootstrap_files_for_upload()
        self.assertIsNone(runner._master_bootstrap_script_path)
Esempio n. 10
0
    def test_create_master_bootstrap_script(self):
        # create a fake src tarball
        foo_py_path = os.path.join(self.tmp_dir, 'foo.py')
        with open(foo_py_path, 'w'):
            pass

        runner = DataprocJobRunner(
            conf_paths=[],
            bootstrap=[
                PYTHON_BIN + ' ' + foo_py_path + '#bar.py',
                'gs://walrus/scripts/ohnoes.sh#', 'echo "Hi!"', 'true', 'ls',
                'speedups.sh', '/tmp/s.sh'
            ],
            bootstrap_mrjob=True)

        runner._add_bootstrap_files_for_upload()

        self.assertIsNotNone(runner._master_bootstrap_script_path)
        self.assertTrue(os.path.exists(runner._master_bootstrap_script_path))

        with open(runner._master_bootstrap_script_path) as f:
            lines = [line.rstrip() for line in f]

        self.assertEqual(lines[0], '#!/bin/sh -ex')

        # check PWD gets stored
        self.assertIn('__mrjob_PWD=$PWD', lines)

        def assertScriptDownloads(path, name=None):
            uri = runner._upload_mgr.uri(path)
            name = runner._bootstrap_dir_mgr.name('file', path, name=name)

            self.assertIn(
                '  hadoop fs -copyToLocal %s $__mrjob_PWD/%s' % (uri, name),
                lines)
            self.assertIn('  chmod u+rx $__mrjob_PWD/%s' % (name, ), lines)

        # check files get downloaded
        assertScriptDownloads(foo_py_path, 'bar.py')
        assertScriptDownloads('gs://walrus/scripts/ohnoes.sh')
        assertScriptDownloads(runner._mrjob_zip_path)

        # check scripts get run

        # bootstrap
        self.assertIn('  ' + PYTHON_BIN + ' $__mrjob_PWD/bar.py', lines)
        self.assertIn('  $__mrjob_PWD/ohnoes.sh', lines)

        self.assertIn('  echo "Hi!"', lines)
        self.assertIn('  true', lines)
        self.assertIn('  ls', lines)

        self.assertIn('  speedups.sh', lines)
        self.assertIn('  /tmp/s.sh', lines)

        # bootstrap_mrjob
        mrjob_zip_name = runner._bootstrap_dir_mgr.name(
            'file', runner._mrjob_zip_path)
        self.assertIn(
            "  __mrjob_PYTHON_LIB=$(" + PYTHON_BIN + " -c 'from"
            " distutils.sysconfig import get_python_lib;"
            " print(get_python_lib())')", lines)
        self.assertIn(
            '  sudo unzip $__mrjob_PWD/' + mrjob_zip_name +
            ' -d $__mrjob_PYTHON_LIB', lines)
        self.assertIn(
            '  sudo ' + PYTHON_BIN + ' -m compileall -q -f'
            ' $__mrjob_PYTHON_LIB/mrjob && true', lines)
        # bootstrap_python
        if PY2:
            self.assertIn('  sudo apt-get install -y python-pip python-dev',
                          lines)
        else:
            self.assertIn(
                '  sudo apt-get install -y python3 python3-pip python3-dev',
                lines)
Esempio n. 11
0
    def test_explicit_tmp_uri(self):
        self._make_bucket('walrus', US_EAST_GCE_REGION)

        runner = DataprocJobRunner(cloud_tmp_dir='gs://walrus/tmp/')

        self.assertEqual(runner._cloud_tmp_dir, 'gs://walrus/tmp/')
Esempio n. 12
0
    def test_reuse_mrjob_bucket_in_same_region(self):
        self._make_bucket('mrjob-1', DEFAULT_GCE_REGION)

        runner = DataprocJobRunner()
        self.assertEqual(runner._cloud_tmp_dir, 'gs://mrjob-1/tmp/')
Esempio n. 13
0
 def test_cannot_be_empty(self):
     runner = DataprocJobRunner(region='')
     self.assertEqual(runner._gce_region, 'us-central1')
Esempio n. 14
0
 def test_explicit_region(self):
     runner = DataprocJobRunner(region='europe-west1')
     self.assertEqual(runner._gce_region, 'europe-west1')
Esempio n. 15
0
 def test_default(self):
     runner = DataprocJobRunner()
     self.assertEqual(runner._gce_region, 'us-central1')
Esempio n. 16
0
 def test_default(self):
     runner = DataprocJobRunner()
     self.assertEqual(runner._opts['region'], 'us-west1')
     self.assertEqual(runner._opts['zone'], None)
     self.assertFalse(self.log.warning.called)
Esempio n. 17
0
 def test_explicit_zone(self):
     runner = DataprocJobRunner(zone='europe-west1-a')
     self.assertEqual(runner._opts['zone'], 'europe-west1-a')
Esempio n. 18
0
    def test_explicit_zone_beats_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_ZONE'] = 'us-west1-b'
            runner = DataprocJobRunner(zone='europe-west1-a')

        self.assertEqual(runner._opts['zone'], 'europe-west1-a')
Esempio n. 19
0
    def test_zone_from_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_ZONE'] = 'us-west1-b'
            runner = DataprocJobRunner()

        self.assertEqual(runner._opts['zone'], 'us-west1-b')
Esempio n. 20
0
    def test_explicit_region_beats_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_REGION'] = 'us-east1'
            runner = DataprocJobRunner(region='europe-west1-a')

        self.assertEqual(runner._opts['region'], 'europe-west1-a')
Esempio n. 21
0
    def test_region_from_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_REGION'] = 'us-east1'
            runner = DataprocJobRunner()

        self.assertEqual(runner._opts['region'], 'us-east1')
Esempio n. 22
0
 def _quick_runner(self):
     r = DataprocJobRunner(conf_paths=[])
     r._cluster_id = 'j-ESSEOWENS'
     r._ran_job = False
     return r
Esempio n. 23
0
 def setUp(self):
     super(GetNewDriverOutputLinesTestCase, self).setUp()
     self.runner = DataprocJobRunner()
     self.fs = self.runner.fs