Example #1
    def test_apply_hdfs_snapshot(self):
        _config_file = os.path.join(os.path.dirname(__file__),
                                    'resources',
                                    'bootstrap',
                                    'bootstrap.ini')
        _raw_sales_dir = HDFS('/tmp/raw/sales')
        _raw_users_dir = HDFS('/tmp/raw/users')
        _raw_tmp_dir = HDFS('/tmp/raw/tmp')
        try:
            # run bootstrap script
            metastore = IniFileMetaStore(file=_config_file)
            _config = Configuration.load(metastore)
            apply_hdfs_snapshot(_config)

            # asserts
            # assert directories were created
            self.assertTrue(_raw_sales_dir.exists(), "Directory '/tmp/raw/sales' was not created")
            self.assertTrue(_raw_users_dir.exists(), "Directory '/tmp/raw/users' was not created")
            self.assertTrue(_raw_tmp_dir.exists(), "Directory '/tmp/raw/tmp' was not created")
            # assert acls were applied
            sales_dir_acls = _raw_sales_dir.get_acls()
            users_dir_acls = _raw_users_dir.get_acls()

            self.assertIsNotNone(sales_dir_acls, '/tmp/raw/sales : ACLs were not applied')
            self.assertTrue('group:sys-pii:r-x' in sales_dir_acls, '/tmp/raw/sales : pii acl was not applied')
            self.assertTrue('group:sales:r--' in sales_dir_acls, '/tmp/raw/sales : sales acl was not applied')

            self.assertIsNotNone(users_dir_acls, '/tmp/raw/users : ACLs were not applied')
            self.assertTrue('group:sys-pii:r-x' in users_dir_acls, '/tmp/raw/users : pii acl was not applied')
        finally:
            _test_basedir = HDFS('/tmp/raw')
            _test_basedir.delete_directory()
            self.assertFalse(_test_basedir.exists(), "ERROR: clean up failed")
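The layout of bootstrap.ini is not shown on this page. A minimal sketch of what apply_hdfs_snapshot could consume, with section and key names that are assumptions rather than the library's actual schema:

    ; hypothetical bootstrap.ini: directories to create, plus the ACL spec per path
    [hdfs_snapshot]
    dirs = /tmp/raw/sales,/tmp/raw/users,/tmp/raw/tmp
    acl./tmp/raw/sales = group:sys-pii:r-x,group:sales:r--
    acl./tmp/raw/users = group:sys-pii:r-x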
Example #2
    def test_merge(self):
        basedir = os.path.dirname(os.path.realpath(__file__))
        local = LocalFS(os.path.join(basedir, "resources", "test_merge"))
        hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        merged_file = LocalFS(os.path.join(basedir, "resources", "merged.txt"))
        try:
            local.copy_to_hdfs(hdfs_file.path)
            self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
            hdfs_file.merge(merged_file.path)
            self.assertTrue(merged_file.exists(), "merged file was not copied to local fs")
        finally:
            hdfs_file.delete_directory()
            merged_file.delete()
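HDFS.merge(path) evidently concatenates the files under an HDFS path into a single local file, which is what `hadoop fs -getmerge` does on the command line. A standalone sketch of that behavior, assuming only a Hadoop client on the PATH (this is a guess at what merge() wraps, not the library's actual implementation):

    import subprocess

    def hdfs_getmerge(hdfs_path, local_file):
        # equivalent to: hadoop fs -getmerge <hdfs_path> <local_file>
        subprocess.check_call(["hadoop", "fs", "-getmerge", hdfs_path, local_file])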
Example #3
    def test_get_modification_time(self):
        now = datetime.now().strftime("%Y-%m-%d")
        _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            _dir.create_directory()
            _file.create_file()
            self.assertTrue(_dir.exists(), "Dir was not created")
            self.assertTrue(_file.exists(), "File was not created")
            self.assertEqual(now, _dir.modification_time().strftime("%Y-%m-%d"), "Error: dir modification time")
            self.assertEqual(now, _file.modification_time().strftime("%Y-%m-%d"), "Error: File modification time")
        finally:
            _dir.delete_directory()
            _file.delete()
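The test compares modification times at day granularity rather than to the second, which keeps it stable unless it happens to run across midnight. If the wrapper shells out to the Hadoop client, the underlying call is likely `hadoop fs -stat "%y" <path>`; a sketch of parsing that output (an assumption about the implementation, not taken from the library):

    import subprocess
    from datetime import datetime

    def hdfs_modification_time(path):
        # "%y" prints the modification time as "yyyy-MM-dd HH:mm:ss"
        out = subprocess.check_output(["hadoop", "fs", "-stat", "%y", path])
        return datetime.strptime(out.decode().strip(), "%Y-%m-%d %H:%M:%S")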
Example #4
    def test_move_non_empty_dir(self):
        dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
        _dir = None
        try:
            _dir = self._create_non_empty_dir_(os.path.join("/tmp", str(uuid.uuid4())))
            self.assertFalse(dst.exists(), "dst directory already exists")
            _dir.move(dst.path)
            self.assertFalse(_dir.exists(), "original directory should be deleted")
            self.assertTrue(dst.exists(), "directory move operation failed")
        finally:
            if _dir:
                _dir.delete_directory()
                self.assertFalse(_dir.exists(), "Folder was not deleted")
            dst.delete_directory()
            self.assertFalse(dst.exists(), "Dst Folder was not deleted")
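_create_non_empty_dir_ is a helper on the test class that this page does not show. A plausible implementation, reusing only the create_directory/create_file calls seen in the other examples (the file name is invented for illustration):

    def _create_non_empty_dir_(self, path):
        # hypothetical: create the directory and put one file into it,
        # so move/delete are exercised against a non-empty directory
        _dir = HDFS(path)
        _dir.create_directory()
        HDFS(os.path.join(path, "dummy.txt")).create_file()
        return _dir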
Example #5
    def test_streaming_job_without_reducer(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path, map_only_job=True)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("Cannot run map-only job"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())

            # check counters
            self.assertEqual(2, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
            self.assertEqual(11, _job_status.counter(group='Map-Reduce Framework', counter='Map input records'))
            self.assertEqual(3252, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
        finally:
            _job_basedir.delete_directory()
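A map-only job is a streaming job with the reducer disabled, so mapper output is written to HDFS directly. _template_streaming_job_ is not shown on this page; with the plain Hadoop streaming CLI, the same effect comes from setting zero reduce tasks. A sketch of that equivalent invocation (the jar path, mapper, and directories are placeholders):

    import subprocess

    def run_map_only_streaming(input_dir, output_dir, mapper="/bin/cat"):
        # -D mapred.reduce.tasks=0 makes the job map-only
        subprocess.check_call([
            "hadoop", "jar", "hadoop-streaming.jar",  # placeholder path to the streaming jar
            "-D", "mapred.reduce.tasks=0",
            "-input", input_dir,
            "-output", output_dir,
            "-mapper", mapper,
        ])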
Example #6
    def test_streaming_job(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("test_streaming_job_generated test failed"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())
            # counters
            self.assertEqual(740, _job_status.counter(group='Map-Reduce Framework', counter='Spilled Records'),
                             "counters['Map-Reduce Framework']['Spilled Records']")
            self.assertEqual(143, _job_status.counter(group='Map-Reduce Framework', counter='Reduce output records'),
                             "counters['Map-Reduce Framework']['Reduce output records']")
            self.assertEqual(370, _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                             "counters['Map-Reduce Framework']['Reduce input records']")
        finally:
            _job_basedir.delete_directory()
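Each counter assertion above repeats the group/counter pair by hand in the failure message. A small helper can build that message from the lookup itself (plain unittest sugar layered over the status object, not part of the library):

    def assert_counter(test, job_status, group, counter, expected):
        # hypothetical wrapper around job_status.counter(...)
        actual = job_status.counter(group=group, counter=counter)
        test.assertEqual(expected, actual,
                         "counters['%s']['%s']: expected %s, got %s"
                         % (group, counter, expected, actual))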
Example #7
    def test_mr_job_command_generation_with_arguments(self):
        _job_name = "test_mr_job_%s" % uuid.uuid4()

        _base_dir = HDFS(os.path.join("/tmp", _job_name))
        _base_dir.create_directory()
        try:
            jar = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'hadoop-mapreduce-examples.jar')
            # configure job inputs
            _job_input = HDFS(os.path.join(_base_dir.path, "input"))
            _job_input.create_directory()
            LocalFS(
                os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(_job_input.path)

            # configure job output
            _job_output = HDFS(os.path.join(_base_dir.path, "output"))
            if not os.path.exists(jar):
                self.skipTest("'%s' not found" % jar)

            job = MapReduce.prepare_mapreduce_job(jar=jar,
                                                  main_class="wordcount",
                                                  name=_job_name) \
                .with_config_option("split.by", "'\\t'") \
                .with_number_of_reducers(3) \
                .with_arguments(_job_input.path, _job_output.path)
            _command_submission_result = job.run()
            _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job Failed")
            self.assertTrue(_job_output.exists(), "Error: empty job output")
            # check counters
            self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                    counter='HDFS: Number of write operations'))
            self.assertEqual(1, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
            self.assertEqual(3, _job_status.counter(group='Job Counters', counter='Launched reduce tasks'))
            self.assertEqual(2168, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
        finally:
            _base_dir.delete_directory()
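Under the hood the builder presumably assembles a `hadoop jar` command line. A guess at the argv it would produce for this configuration (the exact option spelling is an assumption about the wrapper, not confirmed by this page):

    def wordcount_argv(jar, input_path, output_path):
        # hypothetical command the builder generates; option names are assumptions
        return ["hadoop", "jar", jar, "wordcount",
                "-D", "split.by='\\t'",           # with_config_option("split.by", "'\\t'")
                "-D", "mapreduce.job.reduces=3",  # with_number_of_reducers(3)
                input_path, output_path]          # with_arguments(...)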
Example #8
    def test_streaming_job_with_multiple_inputs(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path)

            _additional_datasource = HDFS(os.path.join(_job_basedir.path, "input2"))
            _additional_datasource.create_directory()
            LocalFS(
                os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(_additional_datasource.path)
            job.take(_additional_datasource.path)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("test_streaming_job_with_multiple_inputs test failed"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())
            # check counters
            self.assertEqual(740, _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                             "counters['Map-Reduce Framework']['Reduce input records']")
        finally:
            _job_basedir.delete_directory()
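With the same raw-data.txt fed in twice, the reducer sees exactly twice the records of the single-input run in Example #6 (740 versus 370), which is what the counter assertion checks. In plain streaming terms, job.take(path) presumably just appends another -input flag (an assumption about the wrapper):

    import subprocess

    def run_streaming_two_inputs(input1, input2, output_dir):
        # streaming accepts repeated -input flags; each adds a datasource to the same job
        subprocess.check_call([
            "hadoop", "jar", "hadoop-streaming.jar",  # placeholder path to the streaming jar
            "-input", input1,
            "-input", input2,
            "-output", output_dir,
            "-mapper", "/bin/cat",
            "-reducer", "/bin/cat",
        ])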