def test_logging_stderr_in_cleanup(self):
    def mock_Popen(*args, **kwargs):
        mock_proc = MagicMock()

        mock_proc.stdout = MagicMock()
        mock_proc.stdout.__iter__.return_value = [b'line1\n', b'line2\n']

        mock_proc.stderr = MagicMock()
        mock_proc.stderr.__iter__.return_value = [
            b'Emergency, everybody to get from street\n']

        mock_proc.wait.return_value = 0

        return mock_proc

    self.start(patch('mrjob.fs.hadoop.Popen', mock_Popen))

    mock_log = self.start(patch('mrjob.fs.hadoop.log'))

    fs = HadoopFilesystem()

    data = b''.join(fs._cat_file('/some/path'))
    self.assertEqual(data, b'line1\nline2\n')

    mock_log.error.assert_called_once_with(
        'STDERR: Emergency, everybody to get from street')
def test_deprecated_hadoop_home_option(self):
    hadoop_home = join(self.tmp_dir, 'hadoop_home_option')
    hadoop_bin = self.makefile(join(hadoop_home, 'bin', 'hadoop'),
                               executable=True)

    # deprecation warning is in HadoopJobRunner
    self.fs = HadoopFilesystem(hadoop_home=hadoop_home)

    with no_handlers_for_logger('mrjob.fs.hadoop'):
        self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_bin])
def setUp(self):
    super(FindHadoopBinTestCase, self).setUp()

    # track calls to which()
    self.which = self.start(patch('mrjob.fs.hadoop.which', wraps=which))

    # keep which() from searching in /bin, etc.
    os.environ['PATH'] = self.tmp_dir

    # create basic HadoopFilesystem (okay to overwrite)
    self.fs = HadoopFilesystem()
def fs(self):
    # Spark supports basically every filesystem there is

    if not self._fs:
        self._fs = CompositeFilesystem()

        if boto3_installed:
            self._fs.add_fs('s3', S3Filesystem(
                aws_access_key_id=self._opts['aws_access_key_id'],
                aws_secret_access_key=self._opts['aws_secret_access_key'],
                aws_session_token=self._opts['aws_session_token'],
                s3_endpoint=self._opts['s3_endpoint'],
                s3_region=self._opts['s3_region'],
            ), disable_if=_is_permanent_boto3_error)

        if google_libs_installed:
            self._fs.add_fs('gcs', GCSFilesystem(
                project_id=self._opts['project_id'],
                location=self._opts['gcs_region'],
                object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
            ), disable_if=_is_permanent_google_error)

        # Hadoop FS is responsible for all URIs that fall through to it
        self._fs.add_fs('hadoop', HadoopFilesystem(
            self._opts['hadoop_bin']))

        self._fs.add_fs('local', LocalFilesystem())

    return self._fs
def fs(self):
    # Spark supports basically every filesystem there is

    if not self._fs:
        self._fs = CompositeFilesystem()

        if boto3_installed:
            self._fs.add_fs('s3', S3Filesystem(
                aws_access_key_id=self._opts['aws_access_key_id'],
                aws_secret_access_key=self._opts['aws_secret_access_key'],
                aws_session_token=self._opts['aws_session_token'],
                s3_endpoint=self._opts['s3_endpoint'],
                s3_region=self._opts['s3_region'],
            ), disable_if=_is_permanent_boto3_error)

        if google_libs_installed:
            self._fs.add_fs('gcs', GCSFilesystem(
                project_id=self._opts['google_project_id']
            ), disable_if=_is_permanent_google_error)

        self._fs.add_fs('hadoop', HadoopFilesystem(
            self._opts['hadoop_bin']))

        self._fs.add_fs('local', LocalFilesystem())

    return self._fs
def fs(self):
    """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
    filesystem.
    """
    if self._fs is None:
        self._fs = CompositeFilesystem(
            HadoopFilesystem(self._opts['hadoop_bin']),
            LocalFilesystem())

    return self._fs
def fs(self):
    """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
    filesystem.
    """
    if self._fs is None:
        self._fs = CompositeFilesystem()

        # don't pass [] to fs; this means not to use hadoop until
        # fs.set_hadoop_bin() is called (used for running hadoop over SSH).
        hadoop_bin = self._opts['hadoop_bin'] or None

        self._fs.add_fs('hadoop', HadoopFilesystem(hadoop_bin))
        self._fs.add_fs('local', LocalFilesystem())

    return self._fs
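# Illustrative usage sketch (not from mrjob itself): shows how the composite
# filesystem assembled in the fs() properties above could be driven directly.
# It assumes a working `hadoop` binary on PATH and an existing
# hdfs:///tmp/example directory, and that the module paths below match
# mrjob's layout (mrjob.fs.composite / mrjob.fs.local are assumptions;
# mrjob.fs.hadoop is confirmed by the patch targets in the tests).
from mrjob.fs.composite import CompositeFilesystem
from mrjob.fs.hadoop import HadoopFilesystem
from mrjob.fs.local import LocalFilesystem

fs = CompositeFilesystem()
fs.add_fs('hadoop', HadoopFilesystem(['hadoop']))  # shells out to `hadoop fs ...`
fs.add_fs('local', LocalFilesystem())

# hdfs:// (and s3a://, etc.) URIs fall through to HadoopFilesystem;
# plain local paths are handled by LocalFilesystem
for uri in fs.ls('hdfs:///tmp/example'):
    print(uri)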
class HadoopFSTestCase(MockSubprocessTestCase):

    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')
        self.makefile(
            os.path.join(
                'mock_hadoop_home', 'contrib', 'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HADOOP_TMP'] = self.makedirs('mock_hadoop')
        self.env['MOCK_HADOOP_VERSION'] = '2.7.1'

        self.env['USER'] = '******'

    def make_mock_file(self, name, contents='contents'):
        return self.makefile(
            os.path.join(get_mock_hdfs_root(self.env), name), contents)

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.join('hdfs:///data', 'foo')

        self.assertEqual(b''.join(self.fs._cat_file(remote_path)),
                         b'foo\nfoo\n')

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress(b'foo\n' * 1000))

        remote_path = self.fs.join('hdfs:///data', 'foo.bz2')

        self.assertEqual(b''.join(self.fs._cat_file(remote_path)),
                         b'foo\n' * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress(b'foo\n' * 10000))

        remote_path = self.fs.join('hdfs:///data', 'foo.gz')

        self.assertEqual(b''.join(self.fs._cat_file(remote_path)),
                         b'foo\n' * 10000)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_mock_file('f')
        self.make_mock_file('f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_mock_file('f')
        self.make_mock_file('d/f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3n://bucket/')),
                         ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_ls_s3a(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3a://bucket/')),
                         ['s3a://bucket/f', 's3a://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo  bar'])

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_du_non_existent(self):
        self.assertEqual(self.fs.du('hdfs:///does-not-exist'), 0)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d/ave')

        local_path = os.path.join(get_mock_hdfs_root(self.env), 'd', 'ave')
        self.assertEqual(os.path.isdir(local_path), True)

    def test_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.exists(path), False)

    def test_exists_yes(self):
        self.make_mock_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.exists(path), True)

    def test_rm(self):
        local_path = self.make_mock_file('f')
        self.assertEqual(os.path.exists(local_path), True)

        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_rm_recursive(self):
        local_path = self.make_mock_file('foo/bar')
        self.assertEqual(os.path.exists(local_path), True)

        self.fs.rm('hdfs:///foo')  # remove containing directory
        self.assertEqual(os.path.exists(local_path), False)

    def test_rm_nonexistent(self):
        self.fs.rm('hdfs:///baz')

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
class HadoopFSTestCase(MockSubprocessTestCase):

    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')
        self.makefile(
            os.path.join(
                'mock_hadoop_home', 'contrib', 'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_hdfs_file(self, name, contents='contents'):
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def make_hdfs_dir(self, name):
        return self.makedirs(os.path.join('mock_hdfs_root', name))

    def make_hdfs_tree(self, path, files=None):
        if files is None:
            files = ('f', 'g/a/b', 'g/a/a/b')

        test_files = []
        for f in sorted(files):
            f = os.path.join(path, f)
            self.make_hdfs_file(f, f)
            test_files.append("hdfs:///" + f)

        self.assertEqual(
            sorted(self.fs.ls("hdfs:///" + path.rstrip('/') + '/*')),
            test_files
        )

        return path

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_hdfs_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_hdfs_file('f')
        self.make_hdfs_file('f2')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_hdfs_file('f')
        self.make_hdfs_file('d/f2')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///f', 'hdfs:///d/f2'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_hdfs_file('f', 'foo')
        self.make_hdfs_file('f3 win', 'foo' * 10)
        self.assertItemsEqual(list(self.fs.ls('s3n://bucket/')),
                              ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_single_space(self):
        self.make_hdfs_file('foo bar')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_hdfs_file('foo  bar')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///foo  bar'])

    def test_cat_uncompressed(self):
        # mockhadoop doesn't support compressed files, so we won't test for
        # it. this is only a sanity check anyway.
        self.make_hdfs_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_write_str(self):
        path = 'hdfs:///write-test-str'
        content = 'some content!'
        self.fs.write(path, content)
        self.assertEqual("".join(self.fs.cat(path)), content)

    def test_write_file(self):
        path = 'hdfs:///write-test-fileobj'
        content = StringIO('some content!')
        self.fs.write(path, content)
        self.assertEqual("".join(self.fs.cat(path)), content.getvalue())

    def test_write_overwrite(self):
        self.make_hdfs_file('existing', 'this file already exists')
        self.assertRaises(OSError, self.fs.write, 'hdfs:///existing',
                          'can not overwrite')

    def test_copy_from_local(self):
        content = 'file filler'
        dst = 'hdfs:///hadoop-copy'
        src = self.makefile('local-source', content)

        self.fs.copy_from_local(dst, src)
        self.assertEqual("".join(self.fs.cat(dst)), content)

    def test_copy_from_local_override(self):
        src = self.makefile('local-source', 'source')
        self.make_hdfs_file('existing', 'this file already exists')
        self.assertRaises(OSError, self.fs.copy_from_local,
                          'hdfs:///existing', src)

    def test_du(self):
        self.make_hdfs_file('data1', 'abcd')
        self.make_hdfs_file('more/data2', 'defg')
        self.make_hdfs_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d')
        local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
        self.assertEqual(os.path.isdir(local_path), True)

    def test_path_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), False)

    def test_path_exists_yes(self):
        self.make_hdfs_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), True)

    def test_rm(self):
        local_path = self.make_hdfs_file('f')
        self.assertEqual(os.path.exists(local_path), True)

        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_rm_tree_noslash_files(self):
        path = "icio/goodbye-1"
        hdfs_path = "hdfs:///%s" % path
        real_path = self.make_hdfs_dir(path)
        self.make_hdfs_tree(path)
        self.fs.rm(hdfs_path.rstrip("/"))

        # Check that the directory and its files have been removed
        self.assertEqual(os.path.isdir(real_path), False)
        self.assertEqual(self.fs.path_exists(path), False)
        self.assertEqual(list(self.fs.ls(hdfs_path)), [])

    def test_rm_tree_slash_files(self):
        path = "icio/goodbye-2"
        hdfs_path = "hdfs:///%s" % path
        real_path = self.make_hdfs_dir(path)
        self.make_hdfs_tree(path)
        self.fs.rm(hdfs_path.rstrip("/") + "/")

        # Check that the directory and its files have been removed
        self.assertEqual(os.path.isdir(real_path), False)
        self.assertEqual(self.fs.path_exists(hdfs_path), False)
        self.assertEqual(list(self.fs.ls(hdfs_path)), [])

    def test_rm_tree_star_files(self):
        path = "icio/goodbye-3"
        hdfs_path = "hdfs:///%s" % path
        real_path = self.make_hdfs_dir(path)
        self.make_hdfs_tree('icio/goodbye-3')
        self.fs.rm(hdfs_path.rstrip("/") + "/*")

        # Check that the files have been removed but not the root directory
        self.assertEqual(os.path.isdir(real_path), True)
        self.assertEqual(self.fs.path_exists(hdfs_path), True)
        self.assertEqual(list(self.fs.ls(hdfs_path)), [])

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
def test_predefined_hadoop_bin(self):
    self.fs = HadoopFilesystem(hadoop_bin=['hadoop', '-v'])

    self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop', '-v'])
    self.assertFalse(self.which.called)
class FindHadoopBinTestCase(SandboxedTestCase):

    def setUp(self):
        super(FindHadoopBinTestCase, self).setUp()

        # track calls to which()
        self.which = self.start(patch('mrjob.fs.hadoop.which', wraps=which))

        # keep which() from searching in /bin, etc.
        os.environ['PATH'] = self.tmp_dir

        # create basic HadoopFilesystem (okay to overwrite)
        self.fs = HadoopFilesystem()

    def _add_hadoop_bin_for_envvar(self, envvar, *dirnames):
        """Add a fake "Hadoop" binary to its own subdirectory of
        ``self.tmp_dir``, and set *os.environ[envvar]* to point at it. You can
        use *dirnames* to put the binary in a subdirectory of
        *os.environ[envvar]* (e.g. ``'bin'``).

        return the path to the fake Hadoop binary.
        """
        os.environ[envvar] = join(self.tmp_dir, envvar.lower())

        hadoop_bin_path = join(join(os.environ[envvar], *dirnames), 'hadoop')

        self.makefile(hadoop_bin_path, executable=True)

        return hadoop_bin_path

    # tests without environment variables

    def test_do_nothing_on_init(self):
        self.assertFalse(self.which.called)

    def test_fallback(self):
        self.assertFalse(self.which.called)

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])

        self.which.assert_called_once_with('hadoop', path=None)

    def test_predefined_hadoop_bin(self):
        self.fs = HadoopFilesystem(hadoop_bin=['hadoop', '-v'])

        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop', '-v'])
        self.assertFalse(self.which.called)

    # environment variable tests

    def _test_environment_variable(self, envvar, *dirnames):
        """Check if we can find the hadoop binary from *envvar*"""
        # okay to add after HadoopFilesystem() created; it hasn't looked yet
        hadoop_bin = self._add_hadoop_bin_for_envvar(envvar, *dirnames)

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_bin])

    def test_hadoop_prefix(self):
        self._test_environment_variable('HADOOP_PREFIX', 'bin')

    def test_hadoop_home_envvar(self):
        self._test_environment_variable('HADOOP_HOME', 'bin')

    def test_hadoop_install(self):
        self._test_environment_variable('HADOOP_INSTALL', 'bin')

    def test_hadoop_install_hadoop_subdir(self):
        self._test_environment_variable('HADOOP_INSTALL', 'hadoop', 'bin')

    def test_path(self):
        self._test_environment_variable('PATH')

    def test_two_part_path(self):
        hadoop_path1 = join(self.tmp_dir, 'path1')
        hadoop_path1_bin = self.makefile(join(hadoop_path1, 'hadoop'),
                                         executable=True)
        hadoop_path2 = join(self.tmp_dir, 'path2')
        hadoop_path2_bin = self.makefile(join(hadoop_path2, 'hadoop'),
                                         executable=True)

        os.environ['PATH'] = ':'.join([hadoop_path1, hadoop_path2])

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_path1_bin])
            self.assertNotEqual(self.fs.get_hadoop_bin(), [hadoop_path2_bin])

    def test_hadoop_mapred_home(self):
        self._test_environment_variable('HADOOP_MAPRED_HOME', 'bin')

    def test_hadoop_anything_home(self):
        self._test_environment_variable('HADOOP_ANYTHING_HOME', 'bin')

    def test_other_environment_variable(self):
        self._add_hadoop_bin_for_envvar('HADOOP_YARN_MRJOB_DIR', 'bin')

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])

    # precedence tests

    def test_hadoop_prefix_beats_hadoop_home_envvar(self):
        self._add_hadoop_bin_for_envvar('HADOOP_HOME', 'bin')
        self.test_hadoop_prefix()

    def test_hadoop_home_envvar_beats_hadoop_install(self):
        self._add_hadoop_bin_for_envvar('HADOOP_INSTALL', 'bin')
        self.test_hadoop_home_envvar()

    def test_hadoop_install_beats_hadoop_install_subdir(self):
        self._add_hadoop_bin_for_envvar('HADOOP_INSTALL', 'hadoop', 'bin')
        # verify that this test and test_hadoop_install() use same value
        # for $HADOOP_INSTALL
        hadoop_install = os.environ['HADOOP_INSTALL']
        self.test_hadoop_install()
        self.assertEqual(hadoop_install, os.environ['HADOOP_INSTALL'])

    def test_hadoop_install_hadoop_subdir_beats_path(self):
        self._add_hadoop_bin_for_envvar('PATH')
        self.test_hadoop_install_hadoop_subdir()

    def test_path_beats_hadoop_mapred_home(self):
        self._add_hadoop_bin_for_envvar('HADOOP_MAPRED_HOME', 'bin')
        self.test_path()

    def test_hadoop_anything_home_is_alphabetical(self):
        # $HADOOP_ANYTHING_HOME comes before $HADOOP_MAPRED_HOME
        self._add_hadoop_bin_for_envvar('HADOOP_MAPRED_HOME', 'bin')
        self.test_hadoop_anything_home()
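# Illustrative sketch of the lookup order the precedence tests above exercise
# (not part of the test suite). It assumes you have created an executable
# fake script at /tmp/hadoop_home/bin/hadoop; that path is made up for the
# example.
import os

from mrjob.fs.hadoop import HadoopFilesystem

os.environ['HADOOP_HOME'] = '/tmp/hadoop_home'

fs = HadoopFilesystem()          # no hadoop_bin, so get_hadoop_bin() searches
print(fs.get_hadoop_bin())       # per the tests, $HADOOP_PREFIX and
                                 # $HADOOP_HOME are checked before $PATH,
                                 # falling back to ['hadoop']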
def setUp(self):
    super(HadoopFSTestCase, self).setUp()
    # wrap HadoopFilesystem so it gets cat()
    self.fs = HadoopFilesystem(["hadoop"])
    self.set_up_mock_hadoop()
    self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)
class HadoopFSTestCase(MockSubprocessTestCase):

    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')
        self.makefile(
            os.path.join(
                'mock_hadoop_home', 'contrib', 'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_hdfs_file(self, name, contents):
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_hdfs_file('f', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_hdfs_file('f', 'contents')
        self.make_hdfs_file('f2', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_hdfs_file('f', 'contents')
        self.make_hdfs_file('d/f2', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///d/f2'])

    def test_cat_uncompressed(self):
        # mockhadoop doesn't support compressed files, so we won't test for
        # it. this is only a sanity check anyway.
        self.makefile(os.path.join('mock_hdfs_root', 'data', 'foo'),
                      'foo\nfoo\n')

        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_du(self):
        self.makefile(os.path.join('mock_hdfs_root', 'data1'), 'abcd')
        self.makedirs('mock_hdfs_root/more')
        self.makefile(os.path.join('mock_hdfs_root', 'more', 'data2'), 'defg')
        self.makefile(os.path.join('mock_hdfs_root', 'more', 'data3'), 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d')
        local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
        self.assertEqual(os.path.isdir(local_path), True)

    def test_rm(self):
        local_path = self.make_hdfs_file('f', 'contents')
        self.assertEqual(os.path.exists(local_path), True)

        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
class HadoopFSTestCase(MockSubprocessTestCase):

    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')
        self.makefile(
            os.path.join(
                'mock_hadoop_home', 'contrib', 'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_mock_file(self, name, contents='contents'):
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_mock_file('f')
        self.make_mock_file('f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_mock_file('f')
        self.make_mock_file('d/f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3n://bucket/')),
                         ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo  bar'])

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         [b'foo\n', b'foo\n'])

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress(b'foo\n' * 1000))

        remote_path = self.fs.path_join('hdfs:///data', 'foo.bz2')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         [b'foo\n'] * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress(b'foo\n' * 10000))

        remote_path = self.fs.path_join('hdfs:///data', 'foo.gz')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         [b'foo\n'] * 10000)

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        for hadoop_version in ['0.20.0', '0.23.0', '1.2.0', '2.0.0']:
            self.env['MOCK_HADOOP_VERSION'] = hadoop_version
            self.fs.mkdir('hdfs:///d')
            local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
            self.assertEqual(os.path.isdir(local_path), True)

    def test_path_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), False)

    def test_path_exists_yes(self):
        self.make_mock_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), True)

    def test_rm(self):
        local_path = self.make_mock_file('f')
        self.assertEqual(os.path.exists(local_path), True)

        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
class HadoopFSTestCase(MockSubprocessTestCase):

    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')
        self.makefile(
            os.path.join(
                'mock_hadoop_home', 'contrib', 'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HADOOP_TMP'] = self.makedirs('mock_hadoop')
        self.env['MOCK_HADOOP_VERSION'] = '2.7.1'

        self.env['USER'] = '******'

    def make_mock_file(self, name, contents='contents'):
        return self.makefile(
            os.path.join(get_mock_hdfs_root(self.env), name), contents)

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.join('hdfs:///data', 'foo')

        self.assertEqual(
            b''.join(self.fs._cat_file(remote_path)), b'foo\nfoo\n')

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress(b'foo\n' * 1000))

        remote_path = self.fs.join('hdfs:///data', 'foo.bz2')

        self.assertEqual(
            b''.join(self.fs._cat_file(remote_path)), b'foo\n' * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress(b'foo\n' * 10000))

        remote_path = self.fs.join('hdfs:///data', 'foo.gz')

        self.assertEqual(
            b''.join(self.fs._cat_file(remote_path)), b'foo\n' * 10000)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_mock_file('f')
        self.make_mock_file('f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_mock_file('f')
        self.make_mock_file('d/f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3n://bucket/')),
                         ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_ls_s3a(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3a://bucket/')),
                         ['s3a://bucket/f', 's3a://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo  bar'])

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_du_non_existent(self):
        self.assertEqual(self.fs.du('hdfs:///does-not-exist'), 0)

    def test_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.exists(path), False)

    def test_exists_yes(self):
        self.make_mock_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.exists(path), True)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d/ave')

        path_in_mock_hdfs = os.path.join(
            get_mock_hdfs_root(self.env), 'd', 'ave')
        self.assertEqual(os.path.isdir(path_in_mock_hdfs), True)

    def test_put(self):
        local_path = self.makefile('foo', contents=b'bar')
        dest = 'hdfs:///bar'

        self.fs.put(local_path, dest)

        self.assertEqual(b''.join(self.fs.cat(dest)), b'bar')

    def test_no_put_to_dir(self):
        local_path = self.makefile('foo', contents=b'bar')

        self.assertRaises(ValueError, self.fs.put, local_path, 'hdfs:///')

    def test_rm(self):
        path_in_mock_hdfs = self.make_mock_file('f')
        self.assertEqual(os.path.exists(path_in_mock_hdfs), True)

        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(path_in_mock_hdfs), False)

    def test_rm_recursive(self):
        path_in_mock_hdfs = self.make_mock_file('foo/bar')
        self.assertEqual(os.path.exists(path_in_mock_hdfs), True)

        self.fs.rm('hdfs:///foo')  # remove containing directory
        self.assertEqual(os.path.exists(path_in_mock_hdfs), False)

    def test_rm_nonexistent(self):
        self.fs.rm('hdfs:///baz')

    def test_touchz(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

        self.fs.touchz('hdfs:///empty')

        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///empty'])
class HadoopFSTestCase(MockSubprocessTestCase):

    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')
        self.makefile(
            os.path.join('mock_hadoop_home', 'contrib', 'streaming',
                         'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_mock_file(self, name, contents='contents'):
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_mock_file('f')
        self.make_mock_file('f2')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_mock_file('f')
        self.make_mock_file('d/f2')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///f', 'hdfs:///d/f2'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertItemsEqual(list(self.fs.ls('s3n://bucket/')),
                              ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///foo  bar'])

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress('foo\n' * 1000))

        remote_path = self.fs.path_join('hdfs:///data', 'foo.bz2')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n'] * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress('foo\n' * 10000))

        remote_path = self.fs.path_join('hdfs:///data', 'foo.gz')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n'] * 10000)

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        for hadoop_version in ['0.20.0', '0.23.0', '1.2.0', '2.0.0']:
            self.env['MOCK_HADOOP_VERSION'] = hadoop_version
            self.fs.mkdir('hdfs:///d')
            local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
            self.assertEqual(os.path.isdir(local_path), True)

    def test_path_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), False)

    def test_path_exists_yes(self):
        self.make_mock_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), True)

    def test_rm(self):
        local_path = self.make_mock_file('f')
        self.assertEqual(os.path.exists(local_path), True)

        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
def setUp(self):
    super(HadoopFSTestCase, self).setUp()
    # wrap HadoopFilesystem so it gets cat()
    self.fs = HadoopFilesystem(['hadoop'])
    self.set_up_mock_hadoop()
    self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)
class FindHadoopBinTestCase(SandboxedTestCase):

    def setUp(self):
        super(FindHadoopBinTestCase, self).setUp()

        # track calls to which()
        self.which = self.start(patch('mrjob.fs.hadoop.which', wraps=which))

        # keep which() from searching in /bin, etc.
        os.environ['PATH'] = self.tmp_dir

        # create basic HadoopFilesystem (okay to overwrite)
        self.fs = HadoopFilesystem()

    def _add_hadoop_bin_for_envvar(self, envvar, *dirnames):
        """Add a fake "Hadoop" binary to its own subdirectory of
        ``self.tmp_dir``, and set *os.environ[envvar]* to point at it. You can
        use *dirnames* to put the binary in a subdirectory of
        *os.environ[envvar]* (e.g. ``'bin'``).

        return the path to the fake Hadoop binary.
        """
        os.environ[envvar] = join(self.tmp_dir, envvar.lower())

        hadoop_bin_path = join(join(os.environ[envvar], *dirnames), 'hadoop')

        self.makefile(hadoop_bin_path, executable=True)

        return hadoop_bin_path

    # tests without environment variables

    def test_do_nothing_on_init(self):
        self.assertFalse(self.which.called)

    def test_fallback(self):
        self.assertFalse(self.which.called)

        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])

        self.which.assert_called_once_with('hadoop', path=None)

    def test_predefined_hadoop_bin(self):
        self.fs = HadoopFilesystem(hadoop_bin=['hadoop', '-v'])

        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop', '-v'])
        self.assertFalse(self.which.called)

    # environment variable tests

    def _test_environment_variable(self, envvar, *dirnames):
        """Check if we can find the hadoop binary from *envvar*"""
        # okay to add after HadoopFilesystem() created; it hasn't looked yet
        hadoop_bin = self._add_hadoop_bin_for_envvar(envvar, *dirnames)

        self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_bin])

    def test_hadoop_prefix(self):
        self._test_environment_variable('HADOOP_PREFIX', 'bin')

    def test_hadoop_home_envvar(self):
        self._test_environment_variable('HADOOP_HOME', 'bin')

    def test_hadoop_install(self):
        self._test_environment_variable('HADOOP_INSTALL', 'bin')

    def test_hadoop_install_hadoop_subdir(self):
        self._test_environment_variable('HADOOP_INSTALL', 'hadoop', 'bin')

    def test_path(self):
        self._test_environment_variable('PATH')

    def test_two_part_path(self):
        hadoop_path1 = join(self.tmp_dir, 'path1')
        hadoop_path1_bin = self.makefile(join(hadoop_path1, 'hadoop'),
                                         executable=True)
        hadoop_path2 = join(self.tmp_dir, 'path2')
        hadoop_path2_bin = self.makefile(join(hadoop_path2, 'hadoop'),
                                         executable=True)

        os.environ['PATH'] = ':'.join([hadoop_path1, hadoop_path2])

        self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_path1_bin])
        self.assertNotEqual(self.fs.get_hadoop_bin(), [hadoop_path2_bin])

    def test_hadoop_mapred_home(self):
        self._test_environment_variable('HADOOP_MAPRED_HOME', 'bin')

    def test_hadoop_anything_home(self):
        self._test_environment_variable('HADOOP_ANYTHING_HOME', 'bin')

    def test_other_environment_variable(self):
        self._add_hadoop_bin_for_envvar('HADOOP_YARN_MRJOB_DIR', 'bin')

        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])

    # precedence tests

    def test_hadoop_prefix_beats_hadoop_home_envvar(self):
        self._add_hadoop_bin_for_envvar('HADOOP_HOME', 'bin')
        self.test_hadoop_prefix()

    def test_hadoop_home_envvar_beats_hadoop_install(self):
        self._add_hadoop_bin_for_envvar('HADOOP_INSTALL', 'bin')
        self.test_hadoop_home_envvar()

    def test_hadoop_install_beats_hadoop_install_subdir(self):
        self._add_hadoop_bin_for_envvar('HADOOP_INSTALL', 'hadoop', 'bin')
        # verify that this test and test_hadoop_install() use same value
        # for $HADOOP_INSTALL
        hadoop_install = os.environ['HADOOP_INSTALL']
        self.test_hadoop_install()
        self.assertEqual(hadoop_install, os.environ['HADOOP_INSTALL'])

    def test_hadoop_install_hadoop_subdir_beats_path(self):
        self._add_hadoop_bin_for_envvar('PATH')
        self.test_hadoop_install_hadoop_subdir()

    def test_path_beats_hadoop_mapred_home(self):
        self._add_hadoop_bin_for_envvar('HADOOP_MAPRED_HOME', 'bin')
        self.test_path()

    def test_hadoop_anything_home_is_alphabetical(self):
        # $HADOOP_ANYTHING_HOME comes before $HADOOP_MAPRED_HOME
        self._add_hadoop_bin_for_envvar('HADOOP_MAPRED_HOME', 'bin')
        self.test_hadoop_anything_home()