Example #1
    def test_logging_stderr_in_cleanup(self):
        def mock_Popen(*args, **kwargs):
            mock_proc = MagicMock()

            mock_proc.stdout = MagicMock()
            mock_proc.stdout.__iter__.return_value = [b'line1\n', b'line2\n']

            mock_proc.stderr = MagicMock()
            mock_proc.stderr.__iter__.return_value = [
                b'Emergency, everybody to get from street\n'
            ]

            mock_proc.wait.return_value = 0

            return mock_proc

        self.start(patch('mrjob.fs.hadoop.Popen', mock_Popen))

        mock_log = self.start(patch('mrjob.fs.hadoop.log'))

        fs = HadoopFilesystem()

        data = b''.join(fs._cat_file('/some/path'))
        self.assertEqual(data, b'line1\nline2\n')

        mock_log.error.assert_called_once_with(
            'STDERR: Emergency, everybody to get from street')
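
A note on why Example #1 works: MagicMock pre-wires the iteration protocol, and assigning a plain list to __iter__.return_value is enough because mock passes the value through iter() when the stream is iterated. A minimal self-contained sketch of the same trick, using only the standard library (the make_mock_proc name is ours, for illustration):

from unittest.mock import MagicMock

def make_mock_proc(stdout_lines, stderr_lines, returncode=0):
    # a fake Popen result whose streams yield canned lines when iterated;
    # MagicMock passes __iter__.return_value through iter(), so lists work
    proc = MagicMock()
    proc.stdout.__iter__.return_value = stdout_lines
    proc.stderr.__iter__.return_value = stderr_lines
    proc.wait.return_value = returncode
    return proc

proc = make_mock_proc([b'line1\n', b'line2\n'], [b'bad news\n'])
assert list(proc.stdout) == [b'line1\n', b'line2\n']
assert proc.wait() == 0
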
Example #2
    def test_deprecated_hadoop_home_option(self):
        hadoop_home = join(self.tmp_dir, 'hadoop_home_option')
        hadoop_bin = self.makefile(join(hadoop_home, 'bin', 'hadoop'),
                                   executable=True)

        # deprecation warning is in HadoopJobRunner
        self.fs = HadoopFilesystem(hadoop_home=hadoop_home)

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_bin])
Example #3
    def setUp(self):
        super(FindHadoopBinTestCase, self).setUp()

        # track calls to which()
        self.which = self.start(patch('mrjob.fs.hadoop.which', wraps=which))

        # keep which() from searching in /bin, etc.
        os.environ['PATH'] = self.tmp_dir

        # create basic HadoopFilesystem (okay to overwrite)
        self.fs = HadoopFilesystem()
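
The wraps= argument above does the heavy lifting: the patch installs a MagicMock that still delegates every call to the real which(), so the tests get genuine lookup behavior plus call-tracking (self.which.called, assert_called_once_with). The same pattern against a standard-library function, as a sketch:

import shutil
from unittest.mock import patch

# wraps= is evaluated before the patch takes effect, so it captures the
# real function; calls go through the mock and then on to shutil.which
with patch('shutil.which', wraps=shutil.which) as mock_which:
    shutil.which('no-such-binary-here')
    mock_which.assert_called_once_with('no-such-binary-here')
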
Example #4
    def fs(self):
        # Spark supports basically every filesystem there is

        if not self._fs:
            self._fs = CompositeFilesystem()

            if boto3_installed:
                self._fs.add_fs('s3', S3Filesystem(
                    aws_access_key_id=self._opts['aws_access_key_id'],
                    aws_secret_access_key=self._opts['aws_secret_access_key'],
                    aws_session_token=self._opts['aws_session_token'],
                    s3_endpoint=self._opts['s3_endpoint'],
                    s3_region=self._opts['s3_region'],
                ), disable_if=_is_permanent_boto3_error)

            if google_libs_installed:
                self._fs.add_fs('gcs', GCSFilesystem(
                    project_id=self._opts['project_id'],
                    location=self._opts['gcs_region'],
                    object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
                ), disable_if=_is_permanent_google_error)

            # Hadoop FS is responsible for all URIs that fall through to it
            self._fs.add_fs('hadoop', HadoopFilesystem(
                self._opts['hadoop_bin']))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs
Example #5
    def fs(self):
        # Spark supports basically every filesystem there is

        if not self._fs:
            self._fs = CompositeFilesystem()

            if boto3_installed:
                self._fs.add_fs('s3', S3Filesystem(
                    aws_access_key_id=self._opts['aws_access_key_id'],
                    aws_secret_access_key=self._opts['aws_secret_access_key'],
                    aws_session_token=self._opts['aws_session_token'],
                    s3_endpoint=self._opts['s3_endpoint'],
                    s3_region=self._opts['s3_region'],
                ), disable_if=_is_permanent_boto3_error)

            if google_libs_installed:
                self._fs.add_fs('gcs', GCSFilesystem(
                    project_id=self._opts['google_project_id']
                ), disable_if=_is_permanent_google_error)

            self._fs.add_fs('hadoop', HadoopFilesystem(
                self._opts['hadoop_bin']))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs
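
Both fs() properties build the composite lazily on first access and register one filesystem per backend; per the comment in Example #4, the 'hadoop' entry catches URIs that no other filesystem claims. A toy sketch of that kind of dispatch, assuming each filesystem exposes a can_handle_path(path) predicate (simplified; the real CompositeFilesystem also honors the disable_if= hooks shown above):

class ToyCompositeFS(object):
    # illustrative only: try filesystems in registration order and use
    # the first one that claims the path
    def __init__(self):
        self._filesystems = []

    def add_fs(self, name, fs):
        self._filesystems.append((name, fs))

    def _fs_for(self, path):
        for name, fs in self._filesystems:
            if fs.can_handle_path(path):
                return fs
        raise IOError('no filesystem can handle %r' % path)

    def ls(self, path):
        return self._fs_for(path).ls(path)
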
Example #6
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem())
        return self._fs
Example #7
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            # don't pass [] to fs; this means not to use hadoop until
            # fs.set_hadoop_bin() is called (used for running hadoop over SSH).
            hadoop_bin = self._opts['hadoop_bin'] or None

            self._fs.add_fs('hadoop', HadoopFilesystem(hadoop_bin))
            self._fs.add_fs('local', LocalFilesystem())

        return self._fs
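
The normalization above matters because HadoopFilesystem treats its two falsy arguments differently: None means "search the environment lazily in get_hadoop_bin()", while [] means "no hadoop at all until set_hadoop_bin() is called", which is how running Hadoop over SSH is supported per the comment. In short:

from mrjob.fs.hadoop import HadoopFilesystem

fs_auto = HadoopFilesystem(None)    # auto-detects the binary on first use
fs_deferred = HadoopFilesystem([])  # inert until a binary is supplied
fs_deferred.set_hadoop_bin(['/usr/bin/hadoop'])  # e.g. one found over SSH
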
Example #8
class HadoopFSTestCase(MockSubprocessTestCase):
    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join('mock_hadoop_home', 'contrib', 'streaming',
                         'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HADOOP_TMP'] = self.makedirs('mock_hadoop')
        self.env['MOCK_HADOOP_VERSION'] = '2.7.1'

        self.env['USER'] = '******'

    def make_mock_file(self, name, contents='contents'):
        return self.makefile(os.path.join(get_mock_hdfs_root(self.env), name),
                             contents)

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.join('hdfs:///data', 'foo')

        self.assertEqual(b''.join(self.fs._cat_file(remote_path)),
                         b'foo\nfoo\n')

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress(b'foo\n' * 1000))

        remote_path = self.fs.join('hdfs:///data', 'foo.bz2')

        self.assertEqual(b''.join(self.fs._cat_file(remote_path)),
                         b'foo\n' * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress(b'foo\n' * 10000))

        remote_path = self.fs.join('hdfs:///data', 'foo.gz')

        self.assertEqual(b''.join(self.fs._cat_file(remote_path)),
                         b'foo\n' * 10000)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_mock_file('f')
        self.make_mock_file('f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_mock_file('f')
        self.make_mock_file('d/f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3n://bucket/')),
                         ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_ls_s3a(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3a://bucket/')),
                         ['s3a://bucket/f', 's3a://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')), ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')), ['hdfs:///foo  bar'])

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_du_non_existent(self):
        self.assertEqual(self.fs.du('hdfs:///does-not-exist'), 0)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d/ave')
        local_path = os.path.join(get_mock_hdfs_root(self.env), 'd', 'ave')
        self.assertEqual(os.path.isdir(local_path), True)

    def test_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.exists(path), False)

    def test_exists_yes(self):
        self.make_mock_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.exists(path), True)

    def test_rm(self):
        local_path = self.make_mock_file('f')
        self.assertEqual(os.path.exists(local_path), True)
        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_rm_recursive(self):
        local_path = self.make_mock_file('foo/bar')
        self.assertEqual(os.path.exists(local_path), True)
        self.fs.rm('hdfs:///foo')  # remove containing directory
        self.assertEqual(os.path.exists(local_path), False)

    def test_rm_nonexistent(self):
        self.fs.rm('hdfs:///baz')

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
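
gzip_compress() comes from the test suite's imports rather than the code shown here; assuming it simply gzips a byte string in memory, an equivalent helper is:

import gzip
import io

def gzip_compress(data):
    # gzip a byte string in memory; on Python >= 3.2, gzip.compress(data)
    # does the same thing in one call
    buf = io.BytesIO()
    with gzip.GzipFile(fileobj=buf, mode='wb') as f:
        f.write(data)
    return buf.getvalue()
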
Example #9
class HadoopFSTestCase(MockSubprocessTestCase):

    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join(
                'mock_hadoop_home',
                'contrib',
                'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_hdfs_file(self, name, contents='contents'):
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def make_hdfs_dir(self, name):
        return self.makedirs(os.path.join('mock_hdfs_root', name))

    def make_hdfs_tree(self, path, files=None):
        if files is None:
            files = ('f', 'g/a/b', 'g/a/a/b')
        test_files = []
        for f in sorted(files):
            f = os.path.join(path, f)
            self.make_hdfs_file(f, f)
            test_files.append("hdfs:///" + f)
        self.assertEqual(
            sorted(self.fs.ls("hdfs:///" + path.rstrip('/') + '/*')),
            test_files
        )
        return path

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_hdfs_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_hdfs_file('f')
        self.make_hdfs_file('f2')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_hdfs_file('f')
        self.make_hdfs_file('d/f2')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///f', 'hdfs:///d/f2'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_hdfs_file('f', 'foo')
        self.make_hdfs_file('f3 win', 'foo' * 10)
        self.assertItemsEqual(list(self.fs.ls('s3n://bucket/')),
                              ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_single_space(self):
        self.make_hdfs_file('foo bar')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_hdfs_file('foo  bar')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///foo  bar'])

    def test_cat_uncompressed(self):
        # mockhadoop doesn't support compressed files, so we won't test for it.
        # this is only a sanity check anyway.
        self.make_hdfs_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_write_str(self):
        path = 'hdfs:///write-test-str'
        content = 'some content!'
        self.fs.write(path, content)
        self.assertEqual("".join(self.fs.cat(path)), content)

    def test_write_file(self):
        path = 'hdfs:///write-test-fileobj'
        content = StringIO('some content!')
        self.fs.write(path, content)
        self.assertEqual("".join(self.fs.cat(path)), content.getvalue())

    def test_write_overwrite(self):
        self.make_hdfs_file('existing', 'this file already exists')
        self.assertRaises(OSError, self.fs.write, 'hdfs:///existing',
                          'can not overwrite')

    def test_copy_from_local(self):
        content = 'file filler'
        dst = 'hdfs:///hadoop-copy'
        src = self.makefile('local-source', content)

        self.fs.copy_from_local(dst, src)
        self.assertEqual("".join(self.fs.cat(dst)), content)

    def test_copy_from_local_override(self):
        src = self.makefile('local-source', 'source')
        self.make_hdfs_file('existing', 'this file already exists')
        self.assertRaises(OSError, self.fs.copy_from_local,
                          'hdfs:///existing', src)

    def test_du(self):
        self.make_hdfs_file('data1', 'abcd')
        self.make_hdfs_file('more/data2', 'defg')
        self.make_hdfs_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d')
        local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
        self.assertEqual(os.path.isdir(local_path), True)

    def test_path_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), False)

    def test_path_exists_yes(self):
        self.make_hdfs_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), True)

    def test_rm(self):
        local_path = self.make_hdfs_file('f')
        self.assertEqual(os.path.exists(local_path), True)
        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_rm_tree_noslash_files(self):
        path = "icio/goodbye-1"
        hdfs_path = "hdfs:///%s" % path
        real_path = self.make_hdfs_dir(path)
        self.make_hdfs_tree(path)

        self.fs.rm(hdfs_path.rstrip("/"))

        # Check that the directory and its files have been removed
        self.assertEqual(os.path.isdir(real_path), False)
        self.assertEqual(self.fs.path_exists(path), False)
        self.assertEqual(list(self.fs.ls(hdfs_path)), [])

    def test_rm_tree_slash_files(self):
        path = "icio/goodbye-2"
        hdfs_path = "hdfs:///%s" % path
        real_path = self.make_hdfs_dir(path)
        self.make_hdfs_tree(path)

        self.fs.rm(hdfs_path.rstrip("/") + "/")

        # Check that the directory and its files have been removed
        self.assertEqual(os.path.isdir(real_path), False)
        self.assertEqual(self.fs.path_exists(hdfs_path), False)
        self.assertEqual(list(self.fs.ls(hdfs_path)), [])

    def test_rm_tree_star_files(self):
        path = "icio/goodbye-3"
        hdfs_path = "hdfs:///%s" % path
        real_path = self.make_hdfs_dir(path)
        self.make_hdfs_tree('icio/goodbye-3')

        self.fs.rm(hdfs_path.rstrip("/") + "/*")

        # Check that the files have been removed but not the root directory
        self.assertEqual(os.path.isdir(real_path), True)
        self.assertEqual(self.fs.path_exists(hdfs_path), True)
        self.assertEqual(list(self.fs.ls(hdfs_path)), [])

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
Example #10
    def test_predefined_hadoop_bin(self):
        self.fs = HadoopFilesystem(hadoop_bin=['hadoop', '-v'])

        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop', '-v'])

        self.assertFalse(self.which.called)
Example #11
class FindHadoopBinTestCase(SandboxedTestCase):
    def setUp(self):
        super(FindHadoopBinTestCase, self).setUp()

        # track calls to which()
        self.which = self.start(patch('mrjob.fs.hadoop.which', wraps=which))

        # keep which() from searching in /bin, etc.
        os.environ['PATH'] = self.tmp_dir

        # create basic HadoopFilesystem (okay to overwrite)
        self.fs = HadoopFilesystem()

    def _add_hadoop_bin_for_envvar(self, envvar, *dirnames):
        """Add a fake "Hadoop" binary to its own subdirectory of
        ``self.tmp_dir``, and set *os.environ[envvar]* to point at it. You can
        use *dirnames* to put the binary in a subdirectory of
        *os.environ[envvar]* (e.g. ``'bin'``).

        return the path to the fake Hadoop binary.
        """
        os.environ[envvar] = join(self.tmp_dir, envvar.lower())

        hadoop_bin_path = join(join(os.environ[envvar], *dirnames), 'hadoop')

        self.makefile(hadoop_bin_path, executable=True)

        return hadoop_bin_path

    # tests without environment variables

    def test_do_nothing_on_init(self):
        self.assertFalse(self.which.called)

    def test_fallback(self):
        self.assertFalse(self.which.called)

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])

        self.which.assert_called_once_with('hadoop', path=None)

    def test_predefined_hadoop_bin(self):
        self.fs = HadoopFilesystem(hadoop_bin=['hadoop', '-v'])

        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop', '-v'])

        self.assertFalse(self.which.called)

    # environment variable tests

    def _test_environment_variable(self, envvar, *dirnames):
        """Check if we can find the hadoop binary from *envvar*"""
        # okay to add after HadoopFilesystem() created; it hasn't looked yet
        hadoop_bin = self._add_hadoop_bin_for_envvar(envvar, *dirnames)

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_bin])

    def test_hadoop_prefix(self):
        self._test_environment_variable('HADOOP_PREFIX', 'bin')

    def test_hadoop_home_envvar(self):
        self._test_environment_variable('HADOOP_HOME', 'bin')

    def test_hadoop_install(self):
        self._test_environment_variable('HADOOP_INSTALL', 'bin')

    def test_hadoop_install_hadoop_subdir(self):
        self._test_environment_variable('HADOOP_INSTALL', 'hadoop', 'bin')

    def test_path(self):
        self._test_environment_variable('PATH')

    def test_two_part_path(self):
        hadoop_path1 = join(self.tmp_dir, 'path1')
        hadoop_path1_bin = self.makefile(join(hadoop_path1, 'hadoop'),
                                         executable=True)
        hadoop_path2 = join(self.tmp_dir, 'path2')
        hadoop_path2_bin = self.makefile(join(hadoop_path2, 'hadoop'),
                                         executable=True)

        os.environ['PATH'] = ':'.join([hadoop_path1, hadoop_path2])

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_path1_bin])
            self.assertNotEqual(self.fs.get_hadoop_bin(), [hadoop_path2_bin])

    def test_hadoop_mapred_home(self):
        self._test_environment_variable('HADOOP_MAPRED_HOME', 'bin')

    def test_hadoop_anything_home(self):
        self._test_environment_variable('HADOOP_ANYTHING_HOME', 'bin')

    def test_other_environment_variable(self):
        self._add_hadoop_bin_for_envvar('HADOOP_YARN_MRJOB_DIR', 'bin')

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])

    # precedence tests

    def test_hadoop_prefix_beats_hadoop_home_envvar(self):
        self._add_hadoop_bin_for_envvar('HADOOP_HOME', 'bin')
        self.test_hadoop_prefix()

    def test_hadoop_home_envvar_beats_hadoop_install(self):
        self._add_hadoop_bin_for_envvar('HADOOP_INSTALL', 'bin')
        self.test_hadoop_home_envvar()

    def test_hadoop_install_beats_hadoop_install_subdir(self):
        self._add_hadoop_bin_for_envvar('HADOOP_INSTALL', 'hadoop', 'bin')
        # verify that this test and test_hadoop_install() use same value
        # for $HADOOP_INSTALL
        hadoop_install = os.environ['HADOOP_INSTALL']

        self.test_hadoop_install()

        self.assertEqual(hadoop_install, os.environ['HADOOP_INSTALL'])

    def test_hadoop_install_hadoop_subdir_beats_path(self):
        self._add_hadoop_bin_for_envvar('PATH')
        self.test_hadoop_install_hadoop_subdir()

    def test_path_beats_hadoop_mapred_home(self):
        self._add_hadoop_bin_for_envvar('HADOOP_MAPRED_HOME', 'bin')
        self.test_path()

    def test_hadoop_anything_home_is_alphabetical(self):
        # $HADOOP_ANYTHING_HOME comes before $HADOOP_MAPRED_HOME
        self._add_hadoop_bin_for_envvar('HADOOP_MAPRED_HOME', 'bin')
        self.test_hadoop_anything_home()
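
Read together, the precedence tests pin down the search order get_hadoop_bin() is expected to follow: $HADOOP_PREFIX/bin, $HADOOP_HOME/bin, $HADOOP_INSTALL/bin, $HADOOP_INSTALL/hadoop/bin, $PATH, any other $HADOOP_*_HOME/bin in alphabetical order, and finally a bare 'hadoop'. A sketch reconstructed from those tests (not mrjob's actual implementation):

import os

def find_hadoop_bin(which):
    # `which` is any callable with the which(cmd, path=...) signature the
    # tests assert on; returns an argv list, like get_hadoop_bin()
    dirs = []
    for envvar in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL'):
        if os.environ.get(envvar):
            dirs.append(os.path.join(os.environ[envvar], 'bin'))
    if os.environ.get('HADOOP_INSTALL'):
        dirs.append(os.path.join(os.environ['HADOOP_INSTALL'], 'hadoop', 'bin'))

    for d in dirs:
        hadoop_bin = which('hadoop', path=d)
        if hadoop_bin:
            return [hadoop_bin]

    hadoop_bin = which('hadoop', path=None)  # plain $PATH lookup
    if hadoop_bin:
        return [hadoop_bin]

    for envvar in sorted(os.environ):  # alphabetical, per the last test
        if envvar.startswith('HADOOP_') and envvar.endswith('_HOME'):
            hadoop_bin = which(
                'hadoop', path=os.path.join(os.environ[envvar], 'bin'))
            if hadoop_bin:
                return [hadoop_bin]

    return ['hadoop']  # the fallback asserted in test_fallback()
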
Example #12
    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(["hadoop"])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)
Example #13
class HadoopFSTestCase(MockSubprocessTestCase):

    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join(
                'mock_hadoop_home',
                'contrib',
                'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_hdfs_file(self, name, contents):
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_hdfs_file('f', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_hdfs_file('f', 'contents')
        self.make_hdfs_file('f2', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f',
                                                        'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_hdfs_file('f', 'contents')
        self.make_hdfs_file('d/f2', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///d/f2'])

    def test_cat_uncompressed(self):
        # mockhadoop doesn't support compressed files, so we won't test for it.
        # this is only a sanity check anyway.
        self.makefile(os.path.join('mock_hdfs_root', 'data', 'foo'),
                      'foo\nfoo\n')
        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_du(self):
        self.makefile(os.path.join('mock_hdfs_root', 'data1'), 'abcd')
        self.makedirs('mock_hdfs_root/more')
        self.makefile(os.path.join('mock_hdfs_root', 'more', 'data2'), 'defg')
        self.makefile(os.path.join('mock_hdfs_root', 'more', 'data3'), 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d')
        local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
        self.assertEqual(os.path.isdir(local_path), True)

    def test_rm(self):
        local_path = self.make_hdfs_file('f', 'contents')
        self.assertEqual(os.path.exists(local_path), True)
        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
Example #14
class HadoopFSTestCase(MockSubprocessTestCase):

    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join(
                'mock_hadoop_home',
                'contrib',
                'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_mock_file(self, name, contents='contents'):
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_mock_file('f')
        self.make_mock_file('f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_mock_file('f')
        self.make_mock_file('d/f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3n://bucket/')),
                         ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo  bar'])

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         [b'foo\n', b'foo\n'])

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress(b'foo\n' * 1000))

        remote_path = self.fs.path_join('hdfs:///data', 'foo.bz2')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         [b'foo\n'] * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress(b'foo\n' * 10000))

        remote_path = self.fs.path_join('hdfs:///data', 'foo.gz')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         [b'foo\n'] * 10000)

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        for hadoop_version in ['0.20.0', '0.23.0', '1.2.0', '2.0.0']:
            self.env['MOCK_HADOOP_VERSION'] = hadoop_version
            self.fs.mkdir('hdfs:///d')
            local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
            self.assertEqual(os.path.isdir(local_path), True)

    def test_path_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), False)

    def test_path_exists_yes(self):
        self.make_mock_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), True)

    def test_rm(self):
        local_path = self.make_mock_file('f')
        self.assertEqual(os.path.exists(local_path), True)
        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
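
test_mkdir loops over MOCK_HADOOP_VERSION because the flags hadoop fs -mkdir needs changed over time: creating missing parent directories takes -p from Hadoop 0.23 on, while earlier versions did it implicitly. A plausible version switch (illustrative; not copied from mrjob):

def _mkdir_args(hadoop_version, path):
    # naive numeric compare over the first two version components; fine
    # for the versions the test exercises ('0.20.0' through '2.0.0')
    version = tuple(int(x) for x in hadoop_version.split('.')[:2])
    if version >= (0, 23):
        return ['fs', '-mkdir', '-p', path]
    else:
        return ['fs', '-mkdir', path]
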
Example #15
class HadoopFSTestCase(MockSubprocessTestCase):

    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join(
                'mock_hadoop_home',
                'contrib',
                'streaming',
                'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HADOOP_TMP'] = self.makedirs('mock_hadoop')
        self.env['MOCK_HADOOP_VERSION'] = '2.7.1'

        self.env['USER'] = '******'

    def make_mock_file(self, name, contents='contents'):
        return self.makefile(
            os.path.join(get_mock_hdfs_root(self.env), name), contents)

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.join('hdfs:///data', 'foo')

        self.assertEqual(
            b''.join(self.fs._cat_file(remote_path)),
            b'foo\nfoo\n')

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress(b'foo\n' * 1000))

        remote_path = self.fs.join('hdfs:///data', 'foo.bz2')

        self.assertEqual(
            b''.join(self.fs._cat_file(remote_path)),
            b'foo\n' * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress(b'foo\n' * 10000))

        remote_path = self.fs.join('hdfs:///data', 'foo.gz')

        self.assertEqual(
            b''.join(self.fs._cat_file(remote_path)),
            b'foo\n' * 10000)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_mock_file('f')
        self.make_mock_file('f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_mock_file('f')
        self.make_mock_file('d/f2')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///d/f2', 'hdfs:///f'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3n://bucket/')),
                         ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_ls_s3a(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertEqual(sorted(self.fs.ls('s3a://bucket/')),
                         ['s3a://bucket/f', 's3a://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        self.assertEqual(sorted(self.fs.ls('hdfs:///')),
                         ['hdfs:///foo  bar'])

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_du_non_existent(self):
        self.assertEqual(self.fs.du('hdfs:///does-not-exist'), 0)

    def test_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.exists(path), False)

    def test_exists_yes(self):
        self.make_mock_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.exists(path), True)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d/ave')
        path_in_mock_hdfs = os.path.join(
            get_mock_hdfs_root(self.env), 'd', 'ave')
        self.assertEqual(os.path.isdir(path_in_mock_hdfs), True)

    def test_put(self):
        local_path = self.makefile('foo', contents=b'bar')
        dest = 'hdfs:///bar'

        self.fs.put(local_path, dest)
        self.assertEqual(b''.join(self.fs.cat(dest)), b'bar')

    def test_no_put_to_dir(self):
        local_path = self.makefile('foo', contents=b'bar')

        self.assertRaises(ValueError, self.fs.put, local_path, 'hdfs:///')

    def test_rm(self):
        path_in_mock_hdfs = self.make_mock_file('f')
        self.assertEqual(os.path.exists(path_in_mock_hdfs), True)
        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(path_in_mock_hdfs), False)

    def test_rm_recursive(self):
        path_in_mock_hdfs = self.make_mock_file('foo/bar')
        self.assertEqual(os.path.exists(path_in_mock_hdfs), True)
        self.fs.rm('hdfs:///foo')  # remove containing directory
        self.assertEqual(os.path.exists(path_in_mock_hdfs), False)

    def test_rm_nonexistent(self):
        self.fs.rm('hdfs:///baz')

    def test_touchz(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

        self.fs.touchz('hdfs:///empty')

        self.assertEqual(list(self.fs.ls('hdfs:///')),
                         ['hdfs:///empty'])
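
Everything this class exercises ends up as a hadoop fs subcommand; the mock layer in these tests replaces Popen rather than HDFS itself. A simplified picture of the underlying call (flag spellings vary a little between Hadoop versions):

import subprocess

def hadoop_fs(hadoop_bin, *args):
    # run `hadoop fs <args>` and return its stdout, roughly what
    # HadoopFilesystem does before parsing the output
    return subprocess.check_output(list(hadoop_bin) + ['fs'] + list(args))

# e.g. fs.put(local, 'hdfs:///bar')  ~  hadoop fs -put <local> hdfs:///bar
#      fs.touchz('hdfs:///empty')    ~  hadoop fs -touchz hdfs:///empty
#      fs.mkdir('hdfs:///d/ave')     ~  hadoop fs -mkdir -p hdfs:///d/ave
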
Example #16
class HadoopFSTestCase(MockSubprocessTestCase):
    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join('mock_hadoop_home', 'contrib', 'streaming',
                         'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_mock_file(self, name, contents='contents'):
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_mock_file('f')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_mock_file('f')
        self.make_mock_file('f2')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_mock_file('f')
        self.make_mock_file('d/f2')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///f', 'hdfs:///d/f2'])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file('f', 'foo')
        self.make_mock_file('f3 win', 'foo' * 10)
        self.assertItemsEqual(list(self.fs.ls('s3n://bucket/')),
                              ['s3n://bucket/f', 's3n://bucket/f3 win'])

    def test_single_space(self):
        self.make_mock_file('foo bar')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///foo bar'])

    def test_double_space(self):
        self.make_mock_file('foo  bar')
        self.assertItemsEqual(list(self.fs.ls('hdfs:///')),
                              ['hdfs:///foo  bar'])

    def test_cat_uncompressed(self):
        self.make_mock_file('data/foo', 'foo\nfoo\n')

        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_cat_bz2(self):
        self.make_mock_file('data/foo.bz2', bz2.compress('foo\n' * 1000))

        remote_path = self.fs.path_join('hdfs:///data', 'foo.bz2')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n'] * 1000)

    def test_cat_gz(self):
        self.make_mock_file('data/foo.gz', gzip_compress('foo\n' * 10000))

        remote_path = self.fs.path_join('hdfs:///data', 'foo.gz')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n'] * 10000)

    def test_du(self):
        self.make_mock_file('data1', 'abcd')
        self.make_mock_file('more/data2', 'defg')
        self.make_mock_file('more/data3', 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        for hadoop_version in ['0.20.0', '0.23.0', '1.2.0', '2.0.0']:
            self.env['MOCK_HADOOP_VERSION'] = hadoop_version
            self.fs.mkdir('hdfs:///d')
            local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
            self.assertEqual(os.path.isdir(local_path), True)

    def test_path_exists_no(self):
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), False)

    def test_path_exists_yes(self):
        self.make_mock_file('f')
        path = 'hdfs:///f'
        self.assertEqual(self.fs.path_exists(path), True)

    def test_rm(self):
        local_path = self.make_mock_file('f')
        self.assertEqual(os.path.exists(local_path), True)
        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
Example #17
class HadoopFSTestCase(MockSubprocessTestCase):
    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(["hadoop"])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env["HADOOP_HOME"] = self.makedirs("mock_hadoop_home")

        self.makefile(
            os.path.join("mock_hadoop_home", "contrib", "streaming", "hadoop-0.X.Y-streaming.jar"),
            "i are java bytecode",
        )

        self.env["MOCK_HDFS_ROOT"] = self.makedirs("mock_hdfs_root")
        self.env["MOCK_HADOOP_OUTPUT"] = self.makedirs("mock_hadoop_output")
        self.env["USER"] = "******"
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_mock_file(self, name, contents="contents"):
        return self.makefile(os.path.join("mock_hdfs_root", name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls("hdfs:///")), [])

    def test_ls_basic(self):
        self.make_mock_file("f")
        self.assertEqual(list(self.fs.ls("hdfs:///")), ["hdfs:///f"])

    def test_ls_basic_2(self):
        self.make_mock_file("f")
        self.make_mock_file("f2")
        self.assertItemsEqual(list(self.fs.ls("hdfs:///")), ["hdfs:///f", "hdfs:///f2"])

    def test_ls_recurse(self):
        self.make_mock_file("f")
        self.make_mock_file("d/f2")
        self.assertItemsEqual(list(self.fs.ls("hdfs:///")), ["hdfs:///f", "hdfs:///d/f2"])

    def test_ls_s3n(self):
        # hadoop fs -lsr doesn't have user and group info when reading from s3
        self.make_mock_file("f", "foo")
        self.make_mock_file("f3 win", "foo" * 10)
        self.assertItemsEqual(list(self.fs.ls("s3n://bucket/")), ["s3n://bucket/f", "s3n://bucket/f3 win"])

    def test_single_space(self):
        self.make_mock_file("foo bar")
        self.assertItemsEqual(list(self.fs.ls("hdfs:///")), ["hdfs:///foo bar"])

    def test_double_space(self):
        self.make_mock_file("foo  bar")
        self.assertItemsEqual(list(self.fs.ls("hdfs:///")), ["hdfs:///foo  bar"])

    def test_cat_uncompressed(self):
        self.make_mock_file("data/foo", "foo\nfoo\n")

        remote_path = self.fs.path_join("hdfs:///data", "foo")

        self.assertEqual(list(self.fs._cat_file(remote_path)), ["foo\n", "foo\n"])

    def test_cat_bz2(self):
        self.make_mock_file("data/foo.bz2", bz2.compress("foo\n" * 1000))

        remote_path = self.fs.path_join("hdfs:///data", "foo.bz2")

        self.assertEqual(list(self.fs._cat_file(remote_path)), ["foo\n"] * 1000)

    def test_cat_gz(self):
        self.make_mock_file("data/foo.gz", gzip_compress("foo\n" * 10000))

        remote_path = self.fs.path_join("hdfs:///data", "foo.gz")

        self.assertEqual(list(self.fs._cat_file(remote_path)), ["foo\n"] * 10000)

    def test_du(self):
        self.make_mock_file("data1", "abcd")
        self.make_mock_file("more/data2", "defg")
        self.make_mock_file("more/data3", "hijk")

        self.assertEqual(self.fs.du("hdfs:///"), 12)
        self.assertEqual(self.fs.du("hdfs:///data1"), 4)
        self.assertEqual(self.fs.du("hdfs:///more"), 8)
        self.assertEqual(self.fs.du("hdfs:///more/*"), 8)
        self.assertEqual(self.fs.du("hdfs:///more/data2"), 4)
        self.assertEqual(self.fs.du("hdfs:///more/data3"), 4)

    def test_mkdir(self):
        for hadoop_version in ["0.20.0", "0.23.0", "1.2.0", "2.0.0"]:
            self.env["MOCK_HADOOP_VERSION"] = hadoop_version
            self.fs.mkdir("hdfs:///d")
            local_path = os.path.join(self.tmp_dir, "mock_hdfs_root", "d")
            self.assertEqual(os.path.isdir(local_path), True)

    def test_path_exists_no(self):
        path = "hdfs:///f"
        self.assertEqual(self.fs.path_exists(path), False)

    def test_path_exists_yes(self):
        self.make_mock_file("f")
        path = "hdfs:///f"
        self.assertEqual(self.fs.path_exists(path), True)

    def test_rm(self):
        local_path = self.make_mock_file("f")
        self.assertEqual(os.path.exists(local_path), True)
        self.fs.rm("hdfs:///f")
        self.assertEqual(os.path.exists(local_path), False)

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
Example #18
class HadoopFSTestCase(MockSubprocessTestCase):
    def setUp(self):
        super(HadoopFSTestCase, self).setUp()
        # wrap HadoopFilesystem so it gets cat()
        self.fs = HadoopFilesystem(['hadoop'])
        self.set_up_mock_hadoop()
        self.mock_popen(fs_hadoop, mock_hadoop_main, self.env)

    def set_up_mock_hadoop(self):
        # setup fake hadoop home
        self.env = {}
        self.env['HADOOP_HOME'] = self.makedirs('mock_hadoop_home')

        self.makefile(
            os.path.join('mock_hadoop_home', 'contrib', 'streaming',
                         'hadoop-0.X.Y-streaming.jar'),
            'i are java bytecode',
        )

        self.env['MOCK_HDFS_ROOT'] = self.makedirs('mock_hdfs_root')
        self.env['MOCK_HADOOP_OUTPUT'] = self.makedirs('mock_hadoop_output')
        self.env['USER'] = '******'
        # don't set MOCK_HADOOP_LOG, we get command history other ways

    def make_hdfs_file(self, name, contents):
        return self.makefile(os.path.join('mock_hdfs_root', name), contents)

    def test_ls_empty(self):
        self.assertEqual(list(self.fs.ls('hdfs:///')), [])

    def test_ls_basic(self):
        self.make_hdfs_file('f', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')), ['hdfs:///f'])

    def test_ls_basic_2(self):
        self.make_hdfs_file('f', 'contents')
        self.make_hdfs_file('f2', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///f2'])

    def test_ls_recurse(self):
        self.make_hdfs_file('f', 'contents')
        self.make_hdfs_file('d/f2', 'contents')
        self.assertEqual(list(self.fs.ls('hdfs:///')),
                         ['hdfs:///f', 'hdfs:///d/f2'])

    def test_cat_uncompressed(self):
        # mockhadoop doesn't support compressed files, so we won't test for it.
        # this is only a sanity check anyway.
        self.makefile(os.path.join('mock_hdfs_root', 'data', 'foo'),
                      'foo\nfoo\n')
        remote_path = self.fs.path_join('hdfs:///data', 'foo')

        self.assertEqual(list(self.fs._cat_file(remote_path)),
                         ['foo\n', 'foo\n'])

    def test_du(self):
        self.makefile(os.path.join('mock_hdfs_root', 'data1'), 'abcd')
        self.makedirs('mock_hdfs_root/more')
        self.makefile(os.path.join('mock_hdfs_root', 'more', 'data2'), 'defg')
        self.makefile(os.path.join('mock_hdfs_root', 'more', 'data3'), 'hijk')

        self.assertEqual(self.fs.du('hdfs:///'), 12)
        self.assertEqual(self.fs.du('hdfs:///data1'), 4)
        self.assertEqual(self.fs.du('hdfs:///more'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/*'), 8)
        self.assertEqual(self.fs.du('hdfs:///more/data2'), 4)
        self.assertEqual(self.fs.du('hdfs:///more/data3'), 4)

    def test_mkdir(self):
        self.fs.mkdir('hdfs:///d')
        local_path = os.path.join(self.tmp_dir, 'mock_hdfs_root', 'd')
        self.assertEqual(os.path.isdir(local_path), True)

    def test_rm(self):
        local_path = self.make_hdfs_file('f', 'contents')
        self.assertEqual(os.path.exists(local_path), True)
        self.fs.rm('hdfs:///f')
        self.assertEqual(os.path.exists(local_path), False)

    def test_touchz(self):
        # mockhadoop doesn't implement this.
        pass
Example #19
class FindHadoopBinTestCase(SandboxedTestCase):

    def setUp(self):
        super(FindHadoopBinTestCase, self).setUp()

        # track calls to which()
        self.which = self.start(patch('mrjob.fs.hadoop.which', wraps=which))

        # keep which() from searching in /bin, etc.
        os.environ['PATH'] = self.tmp_dir

        # create basic HadoopFilesystem (okay to overwrite)
        self.fs = HadoopFilesystem()

    def _add_hadoop_bin_for_envvar(self, envvar, *dirnames):
        """Add a fake "Hadoop" binary to its own subdirectory of
        ``self.tmp_dir``, and set *os.environ[envvar]* to point at it. You can
        use *dirnames* to put the binary in a subdirectory of
        *os.environ[envvar]* (e.g. ``'bin'``).

        return the path to the fake Hadoop binary.
        """
        os.environ[envvar] = join(self.tmp_dir, envvar.lower())

        hadoop_bin_path = join(join(os.environ[envvar], *dirnames), 'hadoop')

        self.makefile(hadoop_bin_path, executable=True)

        return hadoop_bin_path

    # tests without environment variables

    def test_do_nothing_on_init(self):
        self.assertFalse(self.which.called)

    def test_fallback(self):
        self.assertFalse(self.which.called)

        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])

        self.which.assert_called_once_with('hadoop', path=None)

    def test_predefined_hadoop_bin(self):
        self.fs = HadoopFilesystem(hadoop_bin=['hadoop', '-v'])

        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop', '-v'])

        self.assertFalse(self.which.called)

    # environment variable tests

    def _test_environment_variable(self, envvar, *dirnames):
        """Check if we can find the hadoop binary from *envvar*"""
        # okay to add after HadoopFilesystem() created; it hasn't looked yet
        hadoop_bin = self._add_hadoop_bin_for_envvar(envvar, *dirnames)

        self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_bin])

    def test_hadoop_prefix(self):
        self._test_environment_variable('HADOOP_PREFIX', 'bin')

    def test_hadoop_home_envvar(self):
        self._test_environment_variable('HADOOP_HOME', 'bin')

    def test_hadoop_install(self):
        self._test_environment_variable('HADOOP_INSTALL', 'bin')

    def test_hadoop_install_hadoop_subdir(self):
        self._test_environment_variable('HADOOP_INSTALL', 'hadoop', 'bin')

    def test_path(self):
        self._test_environment_variable('PATH')

    def test_two_part_path(self):
        hadoop_path1 = join(self.tmp_dir, 'path1')
        hadoop_path1_bin = self.makefile(join(hadoop_path1, 'hadoop'),
                                         executable=True)
        hadoop_path2 = join(self.tmp_dir, 'path2')
        hadoop_path2_bin = self.makefile(join(hadoop_path2, 'hadoop'),
                                         executable=True)

        os.environ['PATH'] = ':'.join([hadoop_path1, hadoop_path2])

        self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_path1_bin])
        self.assertNotEqual(self.fs.get_hadoop_bin(), [hadoop_path2_bin])

    def test_hadoop_mapred_home(self):
        self._test_environment_variable('HADOOP_MAPRED_HOME', 'bin')

    def test_hadoop_anything_home(self):
        self._test_environment_variable('HADOOP_ANYTHING_HOME', 'bin')

    def test_other_environment_variable(self):
        self._add_hadoop_bin_for_envvar('HADOOP_YARN_MRJOB_DIR', 'bin')

        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])

    # precedence tests

    def test_hadoop_prefix_beats_hadoop_home_envvar(self):
        self._add_hadoop_bin_for_envvar('HADOOP_HOME', 'bin')
        self.test_hadoop_prefix()

    def test_hadoop_home_envvar_beats_hadoop_install(self):
        self._add_hadoop_bin_for_envvar('HADOOP_INSTALL', 'bin')
        self.test_hadoop_home_envvar()

    def test_hadoop_install_beats_hadoop_install_subdir(self):
        self._add_hadoop_bin_for_envvar('HADOOP_INSTALL', 'hadoop', 'bin')
        # verify that this test and test_hadoop_install() use same value
        # for $HADOOP_INSTALL
        hadoop_install = os.environ['HADOOP_INSTALL']

        self.test_hadoop_install()

        self.assertEqual(hadoop_install, os.environ['HADOOP_INSTALL'])

    def test_hadoop_install_hadoop_subdir_beats_path(self):
        self._add_hadoop_bin_for_envvar('PATH')
        self.test_hadoop_install_hadoop_subdir()

    def test_path_beats_hadoop_mapred_home(self):
        self._add_hadoop_bin_for_envvar('HADOOP_MAPRED_HOME', 'bin')
        self.test_path()

    def test_hadoop_anything_home_is_alphabetical(self):
        # $HADOOP_ANYTHING_HOME comes before $HADOOP_MAPRED_HOME
        self._add_hadoop_bin_for_envvar('HADOOP_MAPRED_HOME', 'bin')
        self.test_hadoop_anything_home()