def test_apply_hdfs_snapshot(self):
    _config_file = os.path.join(os.path.dirname(__file__), 'resources', 'bootsrap', 'bootstrap.ini')
    _raw_sales_dir = HDFS('/tmp/raw/sales')
    _raw_users_dir = HDFS('/tmp/raw/users')
    _raw_tmp_dir = HDFS('/tmp/raw/tmp')
    try:
        # run the bootstrap script
        metastore = IniFileMetaStore(file=_config_file)
        _config = Configuration.load(metastore)
        apply_hdfs_snapshot(_config)

        # assert directories were created
        self.assertTrue(_raw_sales_dir.exists(), "Directory '/tmp/raw/sales' was not created")
        self.assertTrue(_raw_users_dir.exists(), "Directory '/tmp/raw/users' was not created")
        self.assertTrue(_raw_tmp_dir.exists(), "Directory '/tmp/raw/tmp' was not created")

        # assert ACLs were applied
        sales_dir_acls = _raw_sales_dir.get_acls()
        users_dir_acls = _raw_users_dir.get_acls()

        self.assertIsNotNone(sales_dir_acls, '/tmp/raw/sales : ACLs were not applied')
        self.assertTrue('group:sys-pii:r-x' in sales_dir_acls, '/tmp/raw/sales : pii acl was not applied')
        self.assertTrue('group:sales:r--' in sales_dir_acls, '/tmp/raw/sales : sales acl was not applied')

        self.assertIsNotNone(users_dir_acls, '/tmp/raw/users : ACLs were not applied')
        self.assertTrue('group:sys-pii:r-x' in users_dir_acls, '/tmp/raw/users : pii acl was not applied')
    finally:
        _test_basedir = HDFS('/tmp/raw')
        _test_basedir.delete_directory()
        self.assertFalse(_test_basedir.exists(), "ERROR: clean up failed")

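# For reference, a minimal bootstrap.ini that would satisfy the assertions above
# might look like the sketch below. The actual resource file is not shown in this
# test, so the section and key names expected by apply_hdfs_snapshot() are an
# assumption; only the paths and ACL specs are confirmed by the assertions.
#
#   [hdfs]
#   /tmp/raw/sales = group:sys-pii:r-x, group:sales:r--
#   /tmp/raw/users = group:sys-pii:r-x
#   /tmp/raw/tmp =
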
def test_merge(self):
    basedir = os.path.dirname(os.path.realpath(__file__))
    local = LocalFS(os.path.join(basedir, "resources", "test_merge"))
    hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    merged_file = LocalFS(os.path.join(basedir, "resources", "merged.txt"))
    try:
        local.copy_to_hdfs(hdfs_file.path)
        self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
        hdfs_file.merge(merged_file.path)
        self.assertTrue(merged_file.exists(), "Merged file was not copied to the local file system")
    finally:
        hdfs_file.delete_directory()
        merged_file.delete()

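# merge() concatenates all files under an HDFS directory into a single local file.
# The equivalent manual command, assuming merge() wraps the standard HDFS CLI
# (this test does not confirm the underlying call), is:
#
#   hadoop fs -getmerge /tmp/<uuid> resources/merged.txt
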
def test_get_modification_time(self):
    now = datetime.now().strftime("%Y-%m-%d")
    _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        _dir.create_directory()
        _file.create_file()
        self.assertTrue(_dir.exists(), "Dir was not created")
        self.assertTrue(_file.exists(), "File was not created")
        self.assertEqual(now, _dir.modification_time().strftime("%Y-%m-%d"), "Error: dir modification time")
        self.assertEqual(now, _file.modification_time().strftime("%Y-%m-%d"), "Error: file modification time")
    finally:
        _dir.delete_directory()
        _file.delete()

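# The equivalent manual check with the HDFS CLI (an assumption about what
# modification_time() wraps, not confirmed by this test) is:
#
#   hadoop fs -stat "%y" /tmp/<uuid>
#
# which prints the modification timestamp of the given path.
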
def test_move_non_empty_dir(self):
    dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
    _dir = None
    try:
        _dir = self._create_non_empty_dir_(os.path.join("/tmp", str(uuid.uuid4())))
        self.assertFalse(dst.exists(), "dst directory already exists")
        _dir.move(dst.path)
        self.assertFalse(_dir.exists(), "original directory should be deleted")
        self.assertTrue(dst.exists(), "directory move operation failed")
    finally:
        if _dir:
            _dir.delete_directory()
            self.assertFalse(_dir.exists(), "Folder was not deleted")
        dst.delete_directory()
        self.assertFalse(dst.exists(), "Dst folder was not deleted")

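# _create_non_empty_dir_ is a fixture helper defined elsewhere in this test class.
# A minimal sketch of what it might look like, using only the HDFS.create_directory()
# and HDFS.create_file() calls already exercised in these tests; the exact file
# layout it produces is an assumption:
#
#   def _create_non_empty_dir_(self, path):
#       _dir = HDFS(path)
#       _dir.create_directory()
#       # drop a few files into the directory so that it is non-empty
#       for i in range(3):
#           HDFS(os.path.join(path, "file_%d" % i)).create_file()
#       self.assertTrue(_dir.exists(), "Directory was not created")
#       return _dir
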
def test_streaming_job_without_reducer(self):
    _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        job = self._template_streaming_job_(base_dir=_job_basedir.path, map_only_job=True)
        command_result = job.run()
        command_result.if_failed_raise(AssertionError("Cannot run map-only job"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded())
        # check counters
        self.assertEqual(2, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
        self.assertEqual(11, _job_status.counter(group='Map-Reduce Framework', counter='Map input records'))
        self.assertEqual(3252, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
    finally:
        _job_basedir.delete_directory()

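# _template_streaming_job_ is a shared fixture defined elsewhere in this class.
# A sketch of the shape its call sites imply; only the base_dir and map_only_job
# parameters are confirmed by the tests here. The builder API (a streaming
# counterpart to MapReduce.prepare_mapreduce_job used below) and the mapper/reducer
# script names are assumptions for illustration:
#
#   def _template_streaming_job_(self, base_dir, map_only_job=False):
#       _resources = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce')
#       _input_dir = HDFS(os.path.join(base_dir, "input"))
#       _input_dir.create_directory()
#       LocalFS(os.path.join(_resources, 'raw-data.txt')).copy_to_hdfs(_input_dir.path)
#       job = MapReduce.prepare_streaming_job(name="test_streaming_job_%s" % uuid.uuid4())
#       job.take(_input_dir.path)
#       job.process_with(mapper=os.path.join(_resources, 'mapper.py'))
#       if not map_only_job:
#           job.process_with(reducer=os.path.join(_resources, 'reducer.py'))
#       job.save(os.path.join(base_dir, "output"))
#       return job
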
def test_streaming_job(self):
    _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        job = self._template_streaming_job_(base_dir=_job_basedir.path)
        command_result = job.run()
        command_result.if_failed_raise(AssertionError("test_streaming_job_generated test failed"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded())
        # check counters
        self.assertEqual(740, _job_status.counter(group='Map-Reduce Framework', counter='Spilled Records'),
                         "counters['Map-Reduce Framework']['Spilled Records']")
        self.assertEqual(143, _job_status.counter(group='Map-Reduce Framework', counter='Reduce output records'),
                         "counters['Map-Reduce Framework']['Reduce output records']")
        self.assertEqual(370, _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                         "counters['Map-Reduce Framework']['Reduce input records']")
    finally:
        _job_basedir.delete_directory()

def test_mr_job_command_generation_with_arguments(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _base_dir = HDFS(os.path.join("/tmp", _job_name))
    _base_dir.create_directory()
    try:
        jar = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'hadoop-mapreduce-examples.jar')
        # skip before doing any HDFS setup if the example jar is missing
        if not os.path.exists(jar):
            self.skipTest("'%s' not found" % jar)

        # configure job input
        _job_input = HDFS(os.path.join(_base_dir.path, "input"))
        _job_input.create_directory()
        LocalFS(os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
                ).copy_to_hdfs(_job_input.path)

        # configure job output
        _job_output = HDFS(os.path.join(_base_dir.path, "output"))

        job = MapReduce.prepare_mapreduce_job(jar=jar, main_class="wordcount", name=_job_name) \
            .with_config_option("split.by", "'\\t'") \
            .with_number_of_reducers(3) \
            .with_arguments(_job_input.path, _job_output.path)
        _command_submission_result = job.run()
        _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job failed")
        self.assertTrue(_job_output.exists(), "Error: empty job output")

        # check counters
        self.assertEqual(6, _job_status.counter(group='File System Counters', counter='HDFS: Number of write operations'))
        self.assertEqual(1, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
        self.assertEqual(3, _job_status.counter(group='Job Counters', counter='Launched reduce tasks'))
        self.assertEqual(2168, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
    finally:
        _base_dir.delete_directory()

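# For reference, the builder above should generate a command along these lines.
# The exact flags emitted by prepare_mapreduce_job are an assumption;
# -D mapreduce.job.reduces is the standard Hadoop property for the reducer count:
#
#   hadoop jar hadoop-mapreduce-examples.jar wordcount \
#       -D split.by='\t' -D mapreduce.job.reduces=3 \
#       /tmp/test_mr_job_<uuid>/input /tmp/test_mr_job_<uuid>/output
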
def test_streaming_job_with_multiple_inputs(self):
    _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
    try:
        job = self._template_streaming_job_(base_dir=_job_basedir.path)
        # register a second input directory with the job
        _additional_datasource = HDFS(os.path.join(_job_basedir.path, "input2"))
        _additional_datasource.create_directory()
        LocalFS(os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
                ).copy_to_hdfs(_additional_datasource.path)
        job.take(_additional_datasource.path)
        command_result = job.run()
        command_result.if_failed_raise(AssertionError("test_streaming_job_with_multiple_inputs test failed"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded())
        # check counters: with the same dataset supplied twice, reduce input records
        # double from the 370 asserted in test_streaming_job to 740
        self.assertEqual(740, _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                         "counters['Map-Reduce Framework']['Reduce input records']")
    finally:
        _job_basedir.delete_directory()

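# Hadoop streaming accepts -input more than once, so each take() call presumably
# appends another -input flag to the generated command. The repeated flag is
# standard hadoop-streaming usage; that take() maps onto it is an assumption:
#
#   hadoop jar hadoop-streaming.jar -input .../input -input .../input2 ...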