Example #1
    def assert_new_tmp_bucket(self, location, **runner_kwargs):
        """Assert that if we create an DataprocJobRunner with the given keyword
        args, it'll create a new tmp bucket with the given location
        constraint.
        """
        bucket_cache = self._gcs_client._cache_buckets

        existing_buckets = set(bucket_cache.keys())

        runner = DataprocJobRunner(conf_paths=[], **runner_kwargs)

        bucket_name, path = parse_gcs_uri(runner._cloud_tmp_dir)
        runner._create_fs_tmp_bucket(bucket_name, location=location)

        self.assertTrue(bucket_name.startswith('mrjob-'))
        self.assertNotIn(bucket_name, existing_buckets)
        self.assertEqual(path, 'tmp/')

        current_bucket = bucket_cache[bucket_name]
        self.assertEqual(current_bucket['location'], location)

        # Verify that we setup bucket lifecycle rules of 28-day retention
        first_lifecycle_rule = current_bucket['lifecycle']['rule'][0]
        self.assertEqual(first_lifecycle_rule['action'], dict(type='Delete'))
        self.assertEqual(
            first_lifecycle_rule['condition'],
            dict(age=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS))
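A hedged usage sketch for the helper above (the location value and the region kwarg are illustrative assumptions, not taken from the suite):

    def test_tmp_bucket_in_us_central1(self):
        # hypothetical example values for location and region
        self.assert_new_tmp_bucket('us-central1', region='us-central1')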
Example #2
    def test_attach_to_existing_cluster(self):
        runner = DataprocJobRunner(conf_paths=[])

        cluster_body = runner.api_client.cluster_create()
        cluster_id = cluster_body['clusterName']

        stdin = BytesIO(b'foo\nbar\n')

        mr_job = MRTwoStepJob(
            ['-r', 'dataproc', '-v', '--cluster-id', cluster_id])
        mr_job.sandbox(stdin=stdin)

        results = []

        with mr_job.make_runner() as runner:
            runner.run()

            # Generate fake output
            self.put_job_output_parts(runner,
                                      [b'1\t"bar"\n1\t"foo"\n2\tnull\n'])

            # Issue 182: don't create the bootstrap script when
            # attaching to another cluster
            self.assertIsNone(runner._master_bootstrap_script_path)

            results.extend(mr_job.parse_output(runner.cat_output()))

        self.assertEqual(sorted(results), [(1, 'bar'), (1, 'foo'), (2, None)])
Example #3
    def test_attach_to_existing_cluster(self):
        runner = DataprocJobRunner(conf_paths=[])

        cluster_body = runner.api_client.cluster_create()
        cluster_id = cluster_body['clusterName']

        stdin = BytesIO(b'foo\nbar\n')

        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                               '--cluster-id', cluster_id])
        mr_job.sandbox(stdin=stdin)

        results = []

        with mr_job.make_runner() as runner:
            runner.run()

            # Generate fake output
            self.put_job_output_parts(runner, [
                b'1\t"bar"\n1\t"foo"\n2\tnull\n'
            ])

            # Issue 182: don't create the bootstrap script when
            # attaching to another cluster
            self.assertIsNone(runner._master_bootstrap_script_path)

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

        self.assertEqual(sorted(results),
                         [(1, 'bar'), (1, 'foo'), (2, None)])
Example #4
    def test_no_bootstrap_script_if_not_needed(self):
        runner = DataprocJobRunner(conf_paths=[],
                                   bootstrap_mrjob=False,
                                   bootstrap_python=False)

        runner._add_bootstrap_files_for_upload()
        self.assertIsNone(runner._master_bootstrap_script_path)
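A complementary hedged sketch of the positive case (assumed behavior: leaving bootstrap_mrjob on should produce a script):

    def test_bootstrap_script_if_needed(self):
        # hypothetical counterpart to the test above
        runner = DataprocJobRunner(conf_paths=[], bootstrap_mrjob=True)

        runner._add_bootstrap_files_for_upload()
        self.assertIsNotNone(runner._master_bootstrap_script_path)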
Example #5
    def test_dont_take_down_cluster_on_failure(self):
        runner1 = DataprocJobRunner(conf_paths=[])

        runner1._launch_cluster()
        cluster_id = runner1._cluster_id

        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                               '--cluster-id', cluster_id])
        mr_job.sandbox()

        self.mock_jobs_succeed = False

        with mr_job.make_runner() as runner2:
            self.assertIsInstance(runner2, DataprocJobRunner)

            with logger_disabled('mrjob.dataproc'):
                self.assertRaises(StepFailedException, runner2.run)

            cluster2 = runner2._get_cluster(runner2._cluster_id)
            self.assertEqual(_cluster_state_name(cluster2.status.state),
                             'RUNNING')

        # cluster shouldn't get terminated by cleanup
        cluster1 = runner1._get_cluster(runner1._cluster_id)
        self.assertEqual(_cluster_state_name(cluster1.status.state),
                         'RUNNING')
Example #6
    def _test_instance_groups(self, opts, **kwargs):
        """Run a job with the given option dictionary, and check for
        for instance, number, and optional bid price for each instance role.

        Specify expected instance group info like:

        <role>=(num_instances, instance_type, bid_price)
        """
        runner = DataprocJobRunner(**opts)

        # cluster_body = runner.api_client.cluster_create()
        fake_bootstrap_script = 'gs://fake-bucket/fake-script.sh'
        runner._master_bootstrap_script_path = fake_bootstrap_script
        runner._upload_mgr.add(fake_bootstrap_script)
        runner._upload_mgr.add(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)

        cluster_id = runner._launch_cluster()

        cluster = runner._get_cluster(cluster_id)

        conf = cluster.config

        role_to_actual = dict(
            master=self._gce_instance_group_summary(conf.master_config),
            core=self._gce_instance_group_summary(conf.worker_config),
            task=self._gce_instance_group_summary(conf.secondary_worker_config)
        )

        role_to_expected = kwargs.copy()
        role_to_expected.setdefault('master', (1, DEFAULT_GCE_INSTANCE))
        role_to_expected.setdefault('core', (2, DEFAULT_GCE_INSTANCE))
        role_to_expected.setdefault(
            'task', self._gce_instance_group_summary(dict()))
        self.assertEqual(role_to_actual, role_to_expected)
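A hedged call sketch for the helper above, in the <role>=(num_instances, instance_type) shape its docstring describes (the option name and machine type are illustrative assumptions):

    def test_explicit_core_instance_type(self):
        # 'n1-highcpu-4' is a hypothetical machine type
        self._test_instance_groups(
            dict(core_instance_type='n1-highcpu-4'),
            master=(1, DEFAULT_GCE_INSTANCE),
            core=(2, 'n1-highcpu-4'))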
Example #7
    def _test_instance_groups(self, opts, **kwargs):
        """Run a job with the given option dictionary, and check for
        for instance, number, and optional bid price for each instance role.

        Specify expected instance group info like:

        <role>=(num_instances, instance_type, bid_price)
        """
        runner = DataprocJobRunner(**opts)

        # cluster_body = runner.api_client.cluster_create()
        fake_bootstrap_script = 'gs://fake-bucket/fake-script.sh'
        runner._master_bootstrap_script_path = fake_bootstrap_script
        runner._upload_mgr.add(fake_bootstrap_script)
        runner._upload_mgr.add(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH)

        cluster_id = runner._launch_cluster()

        cluster_body = runner._api_cluster_get(cluster_id)

        conf = cluster_body['config']

        role_to_actual = dict(
            master=self._gce_instance_group_summary(conf['masterConfig']),
            core=self._gce_instance_group_summary(conf['workerConfig']),
            task=self._gce_instance_group_summary(
                conf.get('secondaryWorkerConfig'))
        )

        role_to_expected = kwargs.copy()
        role_to_expected.setdefault('master', (1, DEFAULT_GCE_INSTANCE))
        role_to_expected.setdefault('core', (2, DEFAULT_GCE_INSTANCE))
        role_to_expected.setdefault(
            'task', self._gce_instance_group_summary(dict()))
        self.assertEqual(role_to_actual, role_to_expected)
Example #8
    def assert_new_tmp_bucket(self, location, **runner_kwargs):
        """Assert that if we create an DataprocJobRunner with the given keyword
        args, it'll create a new tmp bucket with the given location
        constraint.
        """
        existing_buckets = set(self.mock_gcs_fs)

        runner = DataprocJobRunner(conf_paths=[], **runner_kwargs)

        bucket_name, path = parse_gcs_uri(runner._cloud_tmp_dir)
        runner._create_fs_tmp_bucket(bucket_name, location=location)

        self.assertTrue(bucket_name.startswith('mrjob-'))
        self.assertNotIn(bucket_name, existing_buckets)
        self.assertEqual(path, 'tmp/')

        current_bucket = runner.fs.get_bucket(bucket_name)

        self.assertEqual(current_bucket.location, location.upper())

        # Verify that we setup bucket lifecycle rules of 28-day retention
        first_lifecycle_rule = current_bucket.lifecycle_rules[0]
        self.assertEqual(first_lifecycle_rule['action'], dict(type='Delete'))
        self.assertEqual(first_lifecycle_rule['condition'],
                         dict(age=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS))
Example #9
    def setUp(self):
        super(UpdateStepInterpretationTestCase, self).setUp()
        self.runner = DataprocJobRunner()
        self.get_lines = self.start(
            patch(
                'mrjob.dataproc.DataprocJobRunner._get_new_driver_output_lines',
                return_value=[]))

        self.step_interpretation = {}
Example #10
    def test_cross_region_explicit_tmp_uri(self):
        self._make_bucket('walrus', EU_WEST_GCE_REGION)

        runner = DataprocJobRunner(region=US_EAST_GCE_REGION,
                                   cloud_tmp_dir='gs://walrus/tmp/')

        self.assertEqual(runner._cloud_tmp_dir, 'gs://walrus/tmp/')

        # tmp bucket shouldn't influence region (it did in 0.4.x)
        self.assertEqual(runner._region(), US_EAST_GCE_REGION)
Example #11
    def _test_mode(self, mode):
        r = DataprocJobRunner(conf_paths=[])
        with patch.multiple(r,
                            _cleanup_cluster=mock.DEFAULT,
                            _cleanup_job=mock.DEFAULT,
                            _cleanup_local_tmp=mock.DEFAULT,
                            _cleanup_logs=mock.DEFAULT,
                            _cleanup_cloud_tmp=mock.DEFAULT) as mock_dict:
            r.cleanup(mode=mode)
            yield mock_dict
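A minimal usage sketch, assuming _test_mode is decorated with contextlib.contextmanager on the test class (the mode string and assertions are illustrative):

    def test_cleanup_all(self):
        with self._test_mode('ALL') as mock_dict:
            self.assertFalse(mock_dict['_cleanup_cluster'].called)
            self.assertTrue(mock_dict['_cleanup_local_tmp'].called)
            self.assertTrue(mock_dict['_cleanup_cloud_tmp'].called)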
Example #12
    def test_bootstrap_mrjob_uses_python_bin(self):
        # use all the bootstrap options
        runner = DataprocJobRunner(
            conf_paths=[], bootstrap_mrjob=True, python_bin=["anaconda"]
        )

        runner._add_bootstrap_files_for_upload()
        self.assertIsNotNone(runner._master_bootstrap_script_path)
        with open(runner._master_bootstrap_script_path, "r") as f:
            content = f.read()

        self.assertIn("sudo anaconda -m compileall -q -f", content)
Example #13
    def test_bootstrap_script_respects_sh_bin(self):
        runner = DataprocJobRunner(conf_paths=[])

        self.start(patch('mrjob.dataproc.DataprocJobRunner._sh_bin',
                         return_value=['/bin/bash']))
        runner._add_bootstrap_files_for_upload()
        self.assertIsNotNone(runner._master_bootstrap_script_path)
        with open(runner._master_bootstrap_script_path) as f:
            lines = list(f)

        self.assertEqual(lines[0].strip(), '#!/bin/bash')
Example #14
    def test_bootstrap_script_respects_sh_pre_commands(self):
        runner = DataprocJobRunner(conf_paths=[])

        self.start(patch('mrjob.dataproc.DataprocJobRunner._sh_pre_commands',
                         return_value=['garply', 'quux']))
        runner._add_bootstrap_files_for_upload()
        self.assertIsNotNone(runner._master_bootstrap_script_path)
        with open(runner._master_bootstrap_script_path) as f:
            lines = list(f)

        self.assertEqual([line.strip() for line in lines[1:3]],
                         ['garply', 'quux'])
Example #15
    def test_usr_bin_env(self):
        runner = DataprocJobRunner(conf_paths=[], bootstrap_mrjob=True, sh_bin="bash -e")

        runner._add_bootstrap_files_for_upload()

        self.assertIsNotNone(runner._master_bootstrap_script_path)
        self.assertTrue(os.path.exists(runner._master_bootstrap_script_path))

        with open(runner._master_bootstrap_script_path) as f:
            lines = [line.rstrip() for line in f]

        self.assertEqual(lines[0], "#!/usr/bin/env bash -e")
Example #16
    def test_bootstrap_mrjob_uses_python_bin(self):
        # use all the bootstrap options
        runner = DataprocJobRunner(conf_paths=[],
                                   bootstrap_mrjob=True,
                                   python_bin=['anaconda'])

        runner._add_bootstrap_files_for_upload()
        self.assertIsNotNone(runner._master_bootstrap_script_path)
        with open(runner._master_bootstrap_script_path, 'r') as f:
            content = f.read()

        self.assertIn('sudo anaconda -m compileall -q -f', content)
Example #17
    def test_usr_bin_env(self):
        runner = DataprocJobRunner(conf_paths=[],
                                   bootstrap_mrjob=True,
                                   sh_bin='bash -e')

        runner._add_bootstrap_files_for_upload()

        self.assertIsNotNone(runner._master_bootstrap_script_path)
        self.assertTrue(os.path.exists(runner._master_bootstrap_script_path))

        with open(runner._master_bootstrap_script_path) as f:
            lines = [line.rstrip() for line in f]

        self.assertEqual(lines[0], '#!/usr/bin/env bash -e')
Example #18
    def make_runner(self):
        """Make a runner based on command-line arguments, so we can
        launch this job on EMR, on Dataproc, on Hadoop, or locally.

        :rtype: :py:class:`mrjob.runner.MRJobRunner`
        """
        if self.options.runner == 'emr':
            # avoid requiring dependencies (such as boto3) for other runners
            from mrjob.emr import EMRJobRunner
            return EMRJobRunner(**self.emr_job_runner_kwargs())

        elif self.options.runner == 'dataproc':
            from mrjob.dataproc import DataprocJobRunner
            return DataprocJobRunner(**self.dataproc_job_runner_kwargs())

        elif self.options.runner == 'hadoop':
            from mrjob.hadoop import HadoopJobRunner
            return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

        elif self.options.runner == 'inline':
            raise ValueError("inline is not supported in the multi-lingual"
                             " launcher.")

        else:
            # run locally by default
            from mrjob.local import LocalMRJobRunner
            return LocalMRJobRunner(**self.local_job_runner_kwargs())
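A hedged driver-side sketch of how this dispatch is exercised (MRWordCount and the input path are placeholder names):

    # pick the Dataproc runner from the command line via -r
    mr_job = MRWordCount(['-r', 'dataproc', 'input.txt'])
    with mr_job.make_runner() as runner:
        runner.run()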
Example #19
    def test_dont_take_down_cluster_on_failure(self):
        runner = DataprocJobRunner(conf_paths=[])

        cluster_body = runner.api_client.cluster_create()
        cluster_id = cluster_body['clusterName']

        mr_job = MRTwoStepJob(
            ['-r', 'dataproc', '-v', '--cluster-id', cluster_id])
        mr_job.sandbox()

        self._dataproc_client.job_get_advances_states = (collections.deque(
            ['SETUP_DONE', 'RUNNING', 'ERROR']))

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            with logger_disabled('mrjob.dataproc'):
                self.assertRaises(StepFailedException, runner.run)

            cluster = self.get_cluster_from_runner(runner, cluster_id)
            cluster_state = self._dataproc_client.get_state(cluster)
            self.assertEqual(cluster_state, 'RUNNING')

        # cluster shouldn't get terminated by cleanup
        cluster = (
            self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'RUNNING')
Example #20
    def test_zone_beats_region(self):
        runner = DataprocJobRunner(region='europe-west1',
                                   zone='europe-west1-a')

        self.assertTrue(self.log.warning.called)
        self.assertEqual(runner._opts['region'], None)
        self.assertEqual(runner._opts['zone'], 'europe-west1-a')
Example #21
    def test_gcs_cat(self):
        self.put_gcs_multi({
            'gs://walrus/one': b'one_text',
            'gs://walrus/two': b'two_text',
            'gs://walrus/three': b'three_text',
        })

        runner = DataprocJobRunner(cloud_tmp_dir='gs://walrus/tmp',
                                   conf_paths=[])

        self.assertEqual(list(runner.fs.cat('gs://walrus/one')), [b'one_text'])
Example #22
    def test_command_line_beats_config(self):
        ZONE_CONF = dict(runners=dict(dataproc=dict(zone='us-west1-a')))

        with mrjob_conf_patcher(ZONE_CONF):
            runner = DataprocJobRunner(region='europe-west1')

            # region takes precedence because it was set on the command line
            self.assertEqual(runner._opts['region'], 'europe-west1')
            self.assertEqual(runner._opts['zone'], None)
            # only a problem if you set region and zone
            # in the same config
            self.assertFalse(self.log.warning.called)
Example #23
    def test_create_master_bootstrap_script(self):
        # create a fake src tarball
        foo_py_path = os.path.join(self.tmp_dir, 'foo.py')
        with open(foo_py_path, 'w'):
            pass

        # use all the bootstrap options
        runner = DataprocJobRunner(
            conf_paths=[],
            bootstrap=[
                PYTHON_BIN + ' ' + foo_py_path + '#bar.py',
                'gs://walrus/scripts/ohnoes.sh#',
                # bootstrap_cmds
                'echo "Hi!"',
                'true',
                'ls',
                # bootstrap_scripts
                'speedups.sh',
                '/tmp/s.sh'
            ],
            bootstrap_mrjob=True)

        runner._add_bootstrap_files_for_upload()

        self.assertIsNotNone(runner._master_bootstrap_script_path)
        self.assertTrue(os.path.exists(runner._master_bootstrap_script_path))

        with open(runner._master_bootstrap_script_path) as f:
            lines = [line.rstrip() for line in f]

        self.assertEqual(lines[0], '#!/bin/sh -ex')

        # check PWD gets stored
        self.assertIn('__mrjob_PWD=$PWD', lines)

        def assertScriptDownloads(path, name=None):
            uri = runner._upload_mgr.uri(path)
            name = runner._bootstrap_dir_mgr.name('file', path, name=name)

            self.assertIn(
                'hadoop fs -copyToLocal %s $__mrjob_PWD/%s' % (uri, name),
                lines)
            self.assertIn(
                'chmod a+x $__mrjob_PWD/%s' % (name,),
                lines)

        # check files get downloaded
        assertScriptDownloads(foo_py_path, 'bar.py')
        assertScriptDownloads('gs://walrus/scripts/ohnoes.sh')
        assertScriptDownloads(runner._mrjob_tar_gz_path)

        # check scripts get run

        # bootstrap
        self.assertIn(PYTHON_BIN + ' $__mrjob_PWD/bar.py', lines)
        self.assertIn('$__mrjob_PWD/ohnoes.sh', lines)

        self.assertIn('echo "Hi!"', lines)
        self.assertIn('true', lines)
        self.assertIn('ls', lines)

        self.assertIn('speedups.sh', lines)
        self.assertIn('/tmp/s.sh', lines)

        # bootstrap_mrjob
        mrjob_tar_gz_name = runner._bootstrap_dir_mgr.name(
            'file', runner._mrjob_tar_gz_path)
        self.assertIn("__mrjob_PYTHON_LIB=$(" + PYTHON_BIN + " -c 'from"
                      " distutils.sysconfig import get_python_lib;"
                      " print(get_python_lib())')", lines)
        self.assertIn('sudo tar xfz $__mrjob_PWD/' + mrjob_tar_gz_name +
                      ' -C $__mrjob_PYTHON_LIB', lines)
        self.assertIn('sudo ' + PYTHON_BIN + ' -m compileall -f'
                      ' $__mrjob_PYTHON_LIB/mrjob && true', lines)
        # bootstrap_python
        if PY2:
            self.assertIn('sudo apt-get install -y python-pip python-dev',
                          lines)
        else:
            self.assertIn(
                'sudo apt-get install -y python3 python3-pip python3-dev',
                lines)
Example #24
    def test_create_master_bootstrap_script(self):
        # create a fake src tarball
        foo_py_path = os.path.join(self.tmp_dir, 'foo.py')
        with open(foo_py_path, 'w'):
            pass

        runner = DataprocJobRunner(
            conf_paths=[],
            bootstrap=[
                PYTHON_BIN + ' ' + foo_py_path + '#bar.py',
                'gs://walrus/scripts/ohnoes.sh#', 'echo "Hi!"', 'true', 'ls',
                'speedups.sh', '/tmp/s.sh'
            ],
            bootstrap_mrjob=True)

        runner._add_bootstrap_files_for_upload()

        self.assertIsNotNone(runner._master_bootstrap_script_path)
        self.assertTrue(os.path.exists(runner._master_bootstrap_script_path))

        with open(runner._master_bootstrap_script_path) as f:
            lines = [line.rstrip() for line in f]

        self.assertEqual(lines[0], '#!/bin/sh -ex')

        # check PWD gets stored
        self.assertIn('__mrjob_PWD=$PWD', lines)

        def assertScriptDownloads(path, name=None):
            uri = runner._upload_mgr.uri(path)
            name = runner._bootstrap_dir_mgr.name('file', path, name=name)

            self.assertIn(
                '  hadoop fs -copyToLocal %s $__mrjob_PWD/%s' % (uri, name),
                lines)
            self.assertIn('  chmod u+rx $__mrjob_PWD/%s' % (name, ), lines)

        # check files get downloaded
        assertScriptDownloads(foo_py_path, 'bar.py')
        assertScriptDownloads('gs://walrus/scripts/ohnoes.sh')
        assertScriptDownloads(runner._mrjob_zip_path)

        # check scripts get run

        # bootstrap
        self.assertIn('  ' + PYTHON_BIN + ' $__mrjob_PWD/bar.py', lines)
        self.assertIn('  $__mrjob_PWD/ohnoes.sh', lines)

        self.assertIn('  echo "Hi!"', lines)
        self.assertIn('  true', lines)
        self.assertIn('  ls', lines)

        self.assertIn('  speedups.sh', lines)
        self.assertIn('  /tmp/s.sh', lines)

        # bootstrap_mrjob
        mrjob_zip_name = runner._bootstrap_dir_mgr.name(
            'file', runner._mrjob_zip_path)
        self.assertIn(
            "  __mrjob_PYTHON_LIB=$(" + PYTHON_BIN + " -c 'from"
            " distutils.sysconfig import get_python_lib;"
            " print(get_python_lib())')", lines)
        self.assertIn(
            '  sudo unzip $__mrjob_PWD/' + mrjob_zip_name +
            ' -d $__mrjob_PYTHON_LIB', lines)
        self.assertIn(
            '  sudo ' + PYTHON_BIN + ' -m compileall -q -f'
            ' $__mrjob_PYTHON_LIB/mrjob && true', lines)
        # bootstrap_python
        if PY2:
            self.assertIn('  sudo apt-get install -y python-pip python-dev',
                          lines)
        else:
            self.assertIn(
                '  sudo apt-get install -y python3 python3-pip python3-dev',
                lines)
Example #25
    def test_no_bootstrap_script_if_not_needed(self):
        runner = DataprocJobRunner(conf_paths=[], bootstrap_mrjob=False,
                                   bootstrap_python=False)

        runner._add_bootstrap_files_for_upload()
        self.assertIsNone(runner._master_bootstrap_script_path)
Example #26
    def test_explicit_tmp_uri(self):
        self._make_bucket('walrus', US_EAST_GCE_REGION)

        runner = DataprocJobRunner(cloud_tmp_dir='gs://walrus/tmp/')

        self.assertEqual(runner._cloud_tmp_dir, 'gs://walrus/tmp/')
Example #27
    def test_reuse_mrjob_bucket_in_same_region(self):
        self._make_bucket('mrjob-1', DEFAULT_GCE_REGION)

        runner = DataprocJobRunner()
        self.assertEqual(runner._cloud_tmp_dir, 'gs://mrjob-1/tmp/')
Example #28
    def test_cannot_be_empty(self):
        runner = DataprocJobRunner(region='')
        self.assertEqual(runner._gce_region, 'us-central1')
Example #29
    def _quick_runner(self):
        r = DataprocJobRunner(conf_paths=[])
        r._cluster_id = 'j-ESSEOWENS'
        r._ran_job = False
        return r
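A hedged sketch of how such a fixture might be used; 'CLUSTER' is a real mrjob cleanup mode, but the dispatch assertion is illustrative:

    def test_cleanup_cluster_mode(self):
        r = self._quick_runner()
        # patch the cleanup hook on the instance, as in the cleanup
        # helper shown earlier, and check that it gets dispatched
        with patch.object(r, '_cleanup_cluster') as m:
            r.cleanup(mode='CLUSTER')
        self.assertTrue(m.called)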
Example #30
    def test_default(self):
        runner = DataprocJobRunner()
        self.assertEqual(runner._opts['region'], 'us-west1')
        self.assertEqual(runner._opts['zone'], None)
        self.assertFalse(self.log.warning.called)
Example #31
    def test_explicit_zone(self):
        runner = DataprocJobRunner(zone='europe-west1-a')
        self.assertEqual(runner._opts['zone'], 'europe-west1-a')
Example #32
    def test_region_from_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_REGION'] = 'us-east1'
            runner = DataprocJobRunner()

        self.assertEqual(runner._opts['region'], 'us-east1')
Example #33
    def test_explicit_region_beats_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_REGION'] = 'us-east1'
            runner = DataprocJobRunner(region='europe-west1-a')

        self.assertEqual(runner._opts['region'], 'europe-west1-a')
Example #34
    def test_zone_from_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_ZONE'] = 'us-west1-b'
            runner = DataprocJobRunner()

        self.assertEqual(runner._opts['zone'], 'us-west1-b')
Example #35
    def test_create_master_bootstrap_script(self):
        # create a fake src tarball
        foo_py_path = os.path.join(self.tmp_dir, "foo.py")
        with open(foo_py_path, "w"):
            pass

        # use all the bootstrap options
        runner = DataprocJobRunner(
            conf_paths=[],
            bootstrap=[
                PYTHON_BIN + " " + foo_py_path + "#bar.py",
                "gs://walrus/scripts/ohnoes.sh#",
                # bootstrap_cmds
                'echo "Hi!"',
                "true",
                "ls",
                # bootstrap_scripts
                "speedups.sh",
                "/tmp/s.sh",
            ],
            bootstrap_mrjob=True,
        )

        runner._add_bootstrap_files_for_upload()

        self.assertIsNotNone(runner._master_bootstrap_script_path)
        self.assertTrue(os.path.exists(runner._master_bootstrap_script_path))

        with open(runner._master_bootstrap_script_path) as f:
            lines = [line.rstrip() for line in f]

        self.assertEqual(lines[0], "#!/bin/sh -ex")

        # check PWD gets stored
        self.assertIn("__mrjob_PWD=$PWD", lines)

        def assertScriptDownloads(path, name=None):
            uri = runner._upload_mgr.uri(path)
            name = runner._bootstrap_dir_mgr.name("file", path, name=name)

            self.assertIn("hadoop fs -copyToLocal %s $__mrjob_PWD/%s" % (uri, name), lines)
            self.assertIn("chmod a+x $__mrjob_PWD/%s" % (name,), lines)

        # check files get downloaded
        assertScriptDownloads(foo_py_path, "bar.py")
        assertScriptDownloads("gs://walrus/scripts/ohnoes.sh")
        assertScriptDownloads(runner._mrjob_zip_path)

        # check scripts get run

        # bootstrap
        self.assertIn(PYTHON_BIN + " $__mrjob_PWD/bar.py", lines)
        self.assertIn("$__mrjob_PWD/ohnoes.sh", lines)

        self.assertIn('echo "Hi!"', lines)
        self.assertIn("true", lines)
        self.assertIn("ls", lines)

        self.assertIn("speedups.sh", lines)
        self.assertIn("/tmp/s.sh", lines)

        # bootstrap_mrjob
        mrjob_zip_name = runner._bootstrap_dir_mgr.name("file", runner._mrjob_zip_path)
        self.assertIn(
            "__mrjob_PYTHON_LIB=$(" + PYTHON_BIN + " -c 'from"
            " distutils.sysconfig import get_python_lib;"
            " print(get_python_lib())')",
            lines,
        )
        self.assertIn("sudo unzip $__mrjob_PWD/" + mrjob_zip_name + " -d $__mrjob_PYTHON_LIB", lines)
        self.assertIn("sudo " + PYTHON_BIN + " -m compileall -q -f" " $__mrjob_PYTHON_LIB/mrjob && true", lines)
        # bootstrap_python
        if PY2:
            self.assertIn("sudo apt-get install -y python-pip python-dev", lines)
        else:
            self.assertIn("sudo apt-get install -y python3 python3-pip python3-dev", lines)
Example #36
    def test_default(self):
        runner = DataprocJobRunner()
        self.assertEqual(runner._gce_region, 'us-central1')
Example #37
    def test_explicit_region(self):
        runner = DataprocJobRunner(region='europe-west1')
        self.assertEqual(runner._gce_region, 'europe-west1')
Example #38
    def test_explicit_zone_beats_environment(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_ZONE'] = 'us-west1-b'
            runner = DataprocJobRunner(zone='europe-west1-a')

        self.assertEqual(runner._opts['zone'], 'europe-west1-a')
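A combined hedged sketch (hypothetical test): explicit options should keep winning even when both environment variables are set at once.

    def test_explicit_zone_beats_both_env_vars(self):
        with save_current_environment():
            os.environ['CLOUDSDK_COMPUTE_REGION'] = 'us-east1'
            os.environ['CLOUDSDK_COMPUTE_ZONE'] = 'us-west1-b'
            runner = DataprocJobRunner(zone='europe-west1-a')

        self.assertEqual(runner._opts['zone'], 'europe-west1-a')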