def kill(self, signal=SIGKILL):
  """
  Kills the given processes.
  """
  if self.container_id is None:
    pid = self.__get_pid()
    assert pid is not None, "No processes for %s" % self
    LOG.info('Killing %s with signal %s' % (self, signal))
    exec_process("kill -%d %d" % (signal, pid))
  else:
    LOG.info("Stopping container: {0}".format(self.container_id))
    check_call(["docker", "container", "stop", self.container_id])
def kill(self):
  """
  Kills the given process. Returns the PID that was killed; asserts if no PID
  is found (process not running).
  """
  pid = self.get_pid()
  if pid is None:
    assert 0, "No processes %s found" % self.cmd
  LOG.info('Killing: %s (PID: %d)' % (' '.join(self.cmd), pid))
  exec_process("kill -9 %d" % pid)
  return pid
def kill(self, signal=SIGKILL):
  """
  Kills the given process with the given signal. Returns the PID that was
  killed; asserts if no PID is found (process not running).
  """
  pid = self.get_pid()
  if pid is None:
    assert 0, "No processes %s found" % self.cmd
  LOG.info('Killing: %s (PID: %d) with signal %s'
           % (' '.join(self.cmd), pid, signal))
  exec_process("kill -%d %d" % (signal, pid))
  return pid
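# Hedged usage sketch for the variant above: assumes SIGTERM is imported
# alongside SIGKILL and that 'proc' is an instance of the enclosing class.
# pid = proc.kill(signal=SIGTERM)  # graceful stop instead of the default -9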
def change_cache_directive_repl_for_path(path, repl):
  """Change the replication factor of the cache directive for a given path"""
  dirid = get_cache_directive_for_path(path)
  rc, stdout, stderr = exec_process(
    "hdfs cacheadmin -modifyDirective -id %s -replication %s" % (dirid, repl))
  assert rc == 0, \
    "Error modifying cache directive for path %s (%s, %s)" % (path, stdout, stderr)
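# Hedged usage sketch: the path below is hypothetical and is assumed to already
# carry a cache directive in 'testPool'.
# change_cache_directive_repl_for_path("/test-warehouse/cached_tbl", 3)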
def test_hdfs_open_timeout(self, vector):
  """This verifies that hdfsOpenFile times out appropriately. It tests this by
  halting the NameNode, running a query that needs to do hdfsOpenFile, and
  verifying that it times out and throws an error."""

  # Find the NameNode's pid via pgrep. This would raise an error if it did not
  # find a pid, so there is at least one match.
  rc, pgrep_output, stderr = exec_process("pgrep -f namenode.NameNode")
  assert rc == 0, \
    "Error finding NameNode pid\nstdout={0}\nstderr={1}".format(pgrep_output, stderr)
  # In our test environment, this should only match one pid
  assert pgrep_output.count("\n") == 1
  namenode_pid = pgrep_output.strip()

  # Run a query successfully. This fetches metadata from the NameNode, and since
  # this will be cached, a subsequent run will not ask the NameNode for metadata.
  # This means a subsequent execution will only talk to the NameNode for file open.
  self.execute_query_expect_success(self.client,
      "select count(*) from functional.alltypes", vector=vector)

  # Stop the NameNode and execute the query again. Since the file handle cache is
  # off, the query will do hdfsOpenFile calls and talk to the NameNode. Since the
  # NameNode is stopped, those calls will hang, testing the timeout functionality.
  ex = None
  result = None
  try:
    # Stop the NameNode
    check_call(["kill", "-STOP", namenode_pid])
    start_time = time.time()
    result = self.execute_query("select count(*) from functional.alltypes",
        vector=vector)
    end_time = time.time()
  except Exception, e:
    ex = e
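# Note: the excerpt above ends inside the try/except. The full test presumably
# resumes the halted NameNode afterwards with the SIGCONT counterpart of the
# SIGSTOP sent above (an assumption; this line is not in the excerpt):
# check_call(["kill", "-CONT", namenode_pid])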
def is_path_fully_cached(path):
  """Returns true if all the bytes of the path are cached, false otherwise"""
  rc, stdout, stderr = exec_process(
    "hdfs cacheadmin -listDirectives -stats -path %s" % path)
  assert rc == 0
  caching_stats = stdout.strip("\n").split("\n")[-1].split()
  # Compare BYTES_NEEDED and BYTES_CACHED; the output format is as follows:
  # "ID POOL REPL EXPIRY PATH BYTES_NEEDED BYTES_CACHED FILES_NEEDED FILES_CACHED"
  return len(caching_stats) > 0 and caching_stats[5] == caching_stats[6]
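# A minimal illustration, with fabricated output, of the column positions the
# helper above relies on. The stats line format is
# "ID POOL REPL EXPIRY PATH BYTES_NEEDED BYTES_CACHED FILES_NEEDED FILES_CACHED".
def _example_caching_stats_parse():
  sample = "1 testPool 1 never /test-warehouse/t 1024 1024 1 1"
  stats = sample.split()
  # BYTES_NEEDED (index 5) equals BYTES_CACHED (index 6), so fully cached.
  assert stats[5] == stats[6]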
def test_hive_bulk_partition(self, vector):
  """Regression test for IMPALA-597. Verifies Impala is able to properly read
  tables that were altered using Hive's bulk partition statements that result
  in multiple partitions pointing to the same location.
  TODO: Once IMPALA-624 is resolved re-write this test using Impala instead of Hive.
  """
  self.client.execute("use %s" % self.TEST_DB)
  location = '/test-warehouse/hive_bulk_part'
  # Cleanup any existing data in the table directory.
  self.hdfs_client.delete_file_dir(location[1:], recursive=True)
  # Create the table
  self.client.execute("create table hive_bulk_part(i int) partitioned by (j int) "
      "location '%s'" % location)
  # Point multiple partitions to the same location and use partition locations
  # that do not contain a key=value path.
  self.hdfs_client.make_dir(location[1:] + '/p')
  hive_cmd = "use %s; alter table hive_bulk_part add partition (j=1) location '%s/p'"\
      " partition(j=2) location '%s/p'" % (self.TEST_DB, location, location)
  print "Executing: %s" % hive_cmd
  rc, stdout, stderr = exec_process("hive -e \"%s\"" % hive_cmd)
  assert rc == 0, stdout + '\n' + stderr

  # Insert some data.
  hive_cmd = "insert into table %s.hive_bulk_part partition(j=1) select 1 from "\
      "functional.alltypes limit 1" % self.TEST_DB
  print "Executing: %s" % hive_cmd
  rc, stdout, stderr = exec_process("hive -e \"%s\"" % hive_cmd)
  assert rc == 0, stdout + '\n' + stderr

  # Reload the table metadata and ensure Impala detects this properly.
  self.client.execute("invalidate metadata hive_bulk_part")
  # The data will be read twice because each partition points to the same location.
  data = self.execute_scalar("select sum(i), sum(j) from hive_bulk_part")
  assert data.split('\t') == ['2', '3']

  self.client.execute("insert into hive_bulk_part partition(j) select 1, 1")
  self.client.execute("insert into hive_bulk_part partition(j) select 1, 2")
  data = self.execute_scalar("select sum(i), sum(j) from hive_bulk_part")
  try:
    assert data.split('\t') == ['6', '6']
  except AssertionError:
    pytest.xfail('IMPALA-624: Impala does not use a partition location for INSERT')
def test_load_data(self, vector):
  key_tbl_dir = vector.get_value('key_tbl_dir')
  key_load_dir = vector.get_value('key_load_dir')
  if vector.get_value('partitioned'):
    src_file = "/test-warehouse/alltypes/year=2010/month=1/100101.txt"
    src_tbl_schema = "functional.alltypes"
  else:
    src_file = "/test-warehouse/tinytable/data.csv"
    src_tbl_schema = "functional.tinytable"

  if key_load_dir is not None:
    rc, stdout, stderr = exec_process(
      'hdfs crypto -createZone -keyName %s -path %s' % (key_load_dir, TMP_DIR))
    assert rc == 0, 'Error executing hdfs crypto: %s %s' % (stdout, stderr)

  # hdfs_client doesn't support copy
  rc, stdout, stderr = exec_process('hdfs dfs -cp %s %s' % (src_file, TMP_DIR))
  assert rc == 0, 'Error executing hdfs cp: %s %s' % (stdout, stderr)

  self.client.execute('create table tbl like %s' % src_tbl_schema)

  if key_tbl_dir is not None:
    rc, stdout, stderr = exec_process(
      'hdfs crypto -createZone -keyName %s -path /test-warehouse/%s.db/tbl'
      % (key_tbl_dir, TEST_DB))
    assert rc == 0, 'Error executing hdfs crypto: %s %s' % (stdout, stderr)

  if vector.get_value('partitioned'):
    # Insert a single value to create the partition spec.
    self.client.execute('insert into tbl partition (year=2010, month=1) '
        'values (0,true,0,0,0,0,0,0,NULL,NULL,NULL)')
    self.client.execute('load data inpath \'%s\' into table tbl '
        'partition(year=2010, month=1)' % TMP_DIR)
  else:
    self.client.execute('load data inpath \'%s\' into table tbl' % TMP_DIR)
def setup_method(self, method):
  self.__cleanup()
  self.client.execute('create database if not exists %s' % TEST_DB)
  self.client.execute('use %s' % TEST_DB)
  self.hdfs_client.make_dir(PYWEBHDFS_TMP_DIR)
  # A few tests depend on the .Trash directory being present. In case it doesn't
  # exist, we create a random text file and delete it so that hdfs recreates
  # the trash hierarchy.
  if not self.hdfs_client.exists("/user/{0}/.Trash/".format(getpass.getuser())):
    self.hdfs_client.create_file("test-warehouse/random", file_data="random")
    rc, stdout, stderr = exec_process("hadoop fs -rm /test-warehouse/random")
    assert rc == 0, 'Error re-creating trash: %s %s' % (stdout, stderr)
def test_drop_table_encrypt(self):
  """Verifies that drop <table> purge works in a case where the Trash directory
  and the table directory are in different encryption zones."""
  self.client.execute("create table {0}.t3(i int)".format(TEST_DB))

  # Clean up the trash directory to create an encrypted zone
  rc, stdout, stderr = exec_process(
    "hadoop fs -rmr /user/{0}/.Trash/*".format(getpass.getuser()))
  assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr)

  # Create the table directory and the trash directory in different encryption zones
  self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t3".format(TEST_DB))
  self.create_encryption_zone("testkey2", "/user/{0}/.Trash/".format(getpass.getuser()))
  self.client.execute("drop table {0}.t3 purge".format(TEST_DB))
  assert not self.hdfs_client.exists("test-warehouse/{0}.db/t3".format(TEST_DB))
def run_query_capture_results(cmd, query, exit_on_error):
  """
  Runs the given query command and returns the execution result.
  """
  exec_result = HiveQueryResult(query)
  start_time = datetime.now()
  try:
    rc, stdout, stderr = exec_process(cmd)
  except Exception, e:
    LOG.error('Error while executing query command: %s' % e)
    exec_result.query_error = str(e)
    # TODO: Should probably save the start time and query string for failed queries.
    return exec_result
def test_drop_partition_encrypt(self):
  """Verifies that alter <tbl> drop partition purge works in the case where the
  Trash dir and the partition dir are in different encryption zones. Check
  CDH-31350 for details."""
  self.client.execute(
    "create table {0}.t1(i int) partitioned by (j int)".format(TEST_DB))
  # Add three partitions (j=1), (j=2), (j=3) to table t1
  self.client.execute("alter table {0}.t1 add partition(j=1)".format(TEST_DB))
  self.client.execute("alter table {0}.t1 add partition(j=2)".format(TEST_DB))
  self.client.execute("alter table {0}.t1 add partition(j=3)".format(TEST_DB))

  # Clean up the trash directory to create an encrypted zone
  rc, stdout, stderr = exec_process(
    "hadoop fs -rmr /user/{0}/.Trash/*".format(getpass.getuser()))
  assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr)

  # Create the necessary encryption zones
  self.create_encryption_zone("testkey1",
      "/test-warehouse/{0}.db/t1/j=1".format(TEST_DB))
  self.create_encryption_zone("testkey2",
      "/test-warehouse/{0}.db/t1/j=2".format(TEST_DB))
  self.create_encryption_zone("testkey1",
      "/test-warehouse/{0}.db/t1/j=3".format(TEST_DB))
  self.create_encryption_zone("testkey2",
      "/user/{0}/.Trash/".format(getpass.getuser()))

  # Load sample data into the partition directories
  self.hdfs_client.create_file(
    "test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB), file_data='j1')
  self.hdfs_client.create_file(
    "test-warehouse/{0}.db/t1/j=2/j2.txt".format(TEST_DB), file_data='j2')
  self.hdfs_client.create_file(
    "test-warehouse/{0}.db/t1/j=3/j3.txt".format(TEST_DB), file_data='j3')

  # Drop the partition (j=1) without purge and make sure the partition directory
  # still exists. This behavior is expected due to the difference in encryption
  # zones.
  self.execute_query_expect_failure(
    self.client, "alter table {0}.t1 drop partition(j=1)".format(TEST_DB))
  assert self.hdfs_client.exists(
    "test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB))
  assert self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".format(TEST_DB))

  # Drop the partition j=2 (with purge) and make sure the partition directory is
  # deleted.
  self.client.execute("alter table {0}.t1 drop partition(j=2) purge".format(TEST_DB))
  assert not self.hdfs_client.exists(
    "test-warehouse/{0}.db/t1/j=2/j2.txt".format(TEST_DB))
  assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2".format(TEST_DB))

  # Drop the partition j=3 (with purge) and make sure the partition is deleted.
  # This is the case where the trash directory and the partition data directory
  # are in different encryption zones. Using purge should delete the partition
  # data permanently by skipping the trash.
  self.client.execute("alter table {0}.t1 drop partition(j=3) purge".format(TEST_DB))
  assert not self.hdfs_client.exists(
    "test-warehouse/{0}.db/t1/j=3/j3.txt".format(TEST_DB))
  assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=3".format(TEST_DB))
def get_num_cache_requests_util():
  rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
  assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout, stderr)
  # Remove blank lines from the output before counting.
  lines = [line for line in stdout.split('\n') if line.strip()]
  count = None
  for line in lines:
    if line.startswith("Found "):
      # The line should say "Found <int> entries"; if we find it, parse the
      # number of entries from it.
      count = int(re.search(r'\d+', line).group())
      break
  # If a count is available return it, else fall back to the total number of lines.
  if count is not None:
    return count
  return len(stdout.split('\n'))
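# Hedged illustration of the "Found <int> entries" parsing above, run against
# fabricated cacheadmin output (assumes 're' is imported as in the helper):
def _example_found_line_parse():
  sample = "Found 2 entries\n 1 testPool ...\n 2 testPool ...\n"
  lines = [line for line in sample.split('\n') if line.strip()]
  assert lines[0].startswith("Found ")
  assert int(re.search(r'\d+', lines[0]).group()) == 2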
def get_num_cache_requests_util():
  rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
  assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout, stderr)
  return len(stdout.split('\n'))
def create_encryption_zone(self, key, path):
  """Creates an encryption zone using key 'key' on path 'path'"""
  rc, stdout, stderr = exec_process(
    "hdfs crypto -createZone -keyName %s -path %s" % (key, path))
  assert rc == 0, 'Error creating encryption zone: %s %s' % (stdout, stderr)
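# Hedged setup sketch: the keys referenced by the encryption-zone tests
# ('testkey1', 'testkey2') must already exist in the configured KMS. Something
# like the following would create them (an assumption about the test
# environment, not shown in the source):
# exec_process("hadoop key create testkey1")
# exec_process("hadoop key create testkey2")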
def teardown_method(self, method):
  self.__cleanup()
  # Clean up the trash directory so that further tests aren't affected.
  rc, stdout, stderr = exec_process(
    "hadoop fs -rmr /user/{0}/.Trash/".format(getpass.getuser()))
  assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr)
def drop_cache_directives_for_path(path):
  """Drop the cache directives for a given path"""
  rc, stdout, stderr = exec_process(
    "hdfs cacheadmin -removeDirectives -path %s" % path)
  assert rc == 0, \
    "Error removing cache directive for path %s (%s, %s)" % (path, stdout, stderr)
def get_num_cache_requests():
  """Returns the number of outstanding cache requests"""
  rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
  assert rc == 0, "Error executing hdfs cacheadmin: %s %s" % (stdout, stderr)
  return len(stdout.split("\n"))
def get_cache_directive_for_path(path):
  """Returns the ID of the cache directive for a given path"""
  rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -path %s" % path)
  assert rc == 0
  dirid = re.search(r"^\s+?(\d+)\s+?testPool\s+?.*?$", stdout, re.MULTILINE).group(1)
  return dirid
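# Illustrative check, against fabricated -listDirectives output, of the regex
# used above to pull the directive id out of the 'testPool' row:
def _example_directive_id_regex():
  sample = "Found 1 entry\n  25 testPool      3 never   /test-warehouse/t\n"
  match = re.search(r"^\s+?(\d+)\s+?testPool\s+?.*?$", sample, re.MULTILINE)
  assert match.group(1) == "25"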
def test_drop_partition_encrypt(self):
  """Verifies that alter <tbl> drop partition purge works in the case where the
  Trash dir and the partition dir are in different encryption zones. Check
  IMPALA-2310 for details."""
  self.client.execute(
    "create table {0}.t1(i int) partitioned by (j int)".format(TEST_DB))
  # Add three partitions (j=1), (j=2), (j=3) to table t1
  self.client.execute("alter table {0}.t1 add partition(j=1)".format(TEST_DB))
  self.client.execute("alter table {0}.t1 add partition(j=2)".format(TEST_DB))
  self.client.execute("alter table {0}.t1 add partition(j=3)".format(TEST_DB))

  # Clean up the trash directory to create an encrypted zone
  rc, stdout, stderr = exec_process(
    "hadoop fs -rm -r /user/{0}/.Trash/*".format(getpass.getuser()))
  assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr)

  # Create the necessary encryption zones
  self.create_encryption_zone("testkey1",
      "/test-warehouse/{0}.db/t1/j=1".format(TEST_DB))
  self.create_encryption_zone("testkey2",
      "/test-warehouse/{0}.db/t1/j=2".format(TEST_DB))
  self.create_encryption_zone("testkey1",
      "/test-warehouse/{0}.db/t1/j=3".format(TEST_DB))

  # HDFS 2.8+ creates an individual trash directory per encryption zone; don't
  # create an encryption zone on .Trash in that case, otherwise a recursive
  # trash is created.
  has_own_trash = self.hdfs_client.exists(
    "/test-warehouse/{0}.db/t1/j=1/.Trash".format(TEST_DB))
  if not has_own_trash:
    self.create_encryption_zone("testkey2",
        "/user/{0}/.Trash/".format(getpass.getuser()))

  # Load sample data into the partition directories
  self.hdfs_client.create_file(
    "test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB), file_data='j1')
  self.hdfs_client.create_file(
    "test-warehouse/{0}.db/t1/j=2/j2.txt".format(TEST_DB), file_data='j2')
  self.hdfs_client.create_file(
    "test-warehouse/{0}.db/t1/j=3/j3.txt".format(TEST_DB), file_data='j3')

  # Drop the partition (j=1) without purge and make sure the partition directory
  # still exists. This behavior is expected due to the difference in encryption
  # zones between .Trash and the warehouse directory (prior to HDFS 2.8).
  if not has_own_trash:
    self.execute_query_expect_failure(
      self.client, "alter table {0}.t1 drop partition(j=1)".format(TEST_DB))
    assert self.hdfs_client.exists(
      "test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB))
    assert self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".format(TEST_DB))
  else:
    # On HDFS 2.8+ the query succeeds and creates trash; removing the partition
    # destroys the directories, which moves the data back to the user's trash.
    self.client.execute("alter table {0}.t1 drop partition(j=1)".format(TEST_DB))
    assert self.hdfs_client.exists(
      "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1/j1.txt".format(
        getpass.getuser(), TEST_DB))
    assert not self.hdfs_client.exists(
      "test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB))
    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".format(TEST_DB))

  # Drop the partition j=2 (with purge) and make sure the partition directory is
  # deleted.
  self.client.execute("alter table {0}.t1 drop partition(j=2) purge".format(TEST_DB))
  assert not self.hdfs_client.exists(
    "test-warehouse/{0}.db/t1/j=2/j2.txt".format(TEST_DB))
  assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2".format(TEST_DB))

  # Drop the partition j=3 (with purge) and make sure the partition is deleted.
  # This is the case where the trash directory and the partition data directory
  # are in different encryption zones. Using purge should delete the partition
  # data permanently by skipping the trash.
  self.client.execute("alter table {0}.t1 drop partition(j=3) purge".format(TEST_DB))
  assert not self.hdfs_client.exists(
    "test-warehouse/{0}.db/t1/j=3/j3.txt".format(TEST_DB))
  assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=3".format(TEST_DB))