def test_spark_master_yarn(self):
    """On a YARN master, tmp dir is on HDFS and an upload manager exists."""
    runner = SparkMRJobRunner(spark_master='yarn')

    tmp_dir = runner._spark_tmp_dir
    self.assertTrue(is_uri(tmp_dir))
    self.assertEqual(tmp_dir[:8], 'hdfs:///')
    self.assertIsNotNone(runner._upload_mgr)
def test_spark_master_mesos(self):
    """A Mesos master also gets an HDFS tmp dir plus an upload manager."""
    runner = SparkMRJobRunner(spark_master='mesos://host:12345')

    tmp_dir = runner._spark_tmp_dir
    self.assertTrue(is_uri(tmp_dir))
    self.assertEqual(tmp_dir[:8], 'hdfs:///')
    self.assertIsNotNone(runner._upload_mgr)
def test_default(self):
    """With no spark_master set, tmp dir is a local path ending in '-spark'
    and no upload manager is created."""
    runner = SparkMRJobRunner()

    tmp_dir = runner._spark_tmp_dir
    self.assertFalse(is_uri(tmp_dir))
    self.assertIsNone(runner._upload_mgr)
    self.assertEqual(tmp_dir[-6:], '-spark')
def test_explicit_spark_tmp_dir_uri(self):
    """An explicit URI tmp dir is used as the prefix (with a unique
    suffix appended) and still gets an upload manager."""
    runner = SparkMRJobRunner(
        spark_master='mesos://host:12345',
        spark_tmp_dir='s3://walrus/tmp')

    prefix = 's3://walrus/tmp/'
    tmp_dir = runner._spark_tmp_dir
    self.assertTrue(tmp_dir.startswith(prefix))
    # something unique must have been appended after the prefix
    self.assertGreater(len(tmp_dir), len(prefix))
    self.assertIsNotNone(runner._upload_mgr)
def test_explicit_spark_tmp_dir_path(self):
    """An explicit local-path tmp dir goes through os.path.join() and
    does not get an upload manager."""
    # posixpath.join() and os.path.join() are the same on UNIX; patch
    # join with a recognizable separator so we can see it was used
    self.start(patch('os.path.join', lambda *paths: '/./'.join(paths)))

    runner = SparkMRJobRunner(spark_tmp_dir='/path/to/tmp')

    prefix = '/path/to/tmp/./'
    tmp_dir = runner._spark_tmp_dir
    self.assertTrue(tmp_dir.startswith(prefix))
    # a unique suffix must follow the joined prefix
    self.assertGreater(len(tmp_dir), len(prefix))
    self.assertIsNone(runner._upload_mgr)
def test_ignore_format_and_sort_kwargs(self):
    """hadoop_*_format and sort_values kwargs are ignored by the
    constructor (hadoop formats and SORT_VALUES are read directly
    from the job — see #2022).

    Same setup as test_sort_values(), above.
    """
    stdin = BytesIO(
        b'alligator\nactuary\nbowling\nartichoke\nballoon\nbaby\n')

    runner = SparkMRJobRunner(
        mr_job_script=MRSortAndGroup.mr_job_script(),
        mrjob_cls=MRSortAndGroup,
        stdin=stdin,
        hadoop_input_format='TerribleInputFormat',
        hadoop_output_format='AwfulOutputFormat',
        sort_values=False)

    runner.run()

    # despite sort_values=False and the bogus formats, output is
    # grouped and sorted exactly as the job itself specifies
    output = dict(MRSortAndGroup().parse_output(runner.cat_output()))
    self.assertEqual(
        output,
        dict(a=['actuary', 'alligator', 'artichoke'],
             b=['baby', 'balloon', 'bowling']))
def test_spark_master_local(self):
    """A local[*] master keeps the tmp dir on the local filesystem and
    skips the upload manager."""
    runner = SparkMRJobRunner(spark_master='local[*]')

    self.assertFalse(is_uri(runner._spark_tmp_dir))
    self.assertIsNone(runner._upload_mgr)
def test_local_uri_with_non_local_runner(self):
    """A local tmp dir combined with a remote master logs a warning."""
    SparkMRJobRunner(
        spark_tmp_dir='/tmp',
        spark_master='mesos://host:12345')

    self.assertTrue(self.log.warning.called)
def test_non_local_uri_with_local_runner(self):
    """A remote tmp dir URI with the (default) local master logs a warning."""
    SparkMRJobRunner(spark_tmp_dir='s3://walrus/tmp')

    self.assertTrue(self.log.warning.called)