def kill(self, signal=SIGKILL):
   """
   Kills the given processes.
   """
   if self.container_id is None:
     pid = self.__get_pid()
     assert pid is not None, "No processes for %s" % self
     LOG.info('Killing %s with signal %s' % (self, signal))
     exec_process("kill -%d %d" % (signal, pid))
   else:
     LOG.info("Stopping container: {0}".format(self.container_id))
     check_call(["docker", "container", "stop", self.container_id])
Example #2
  def kill(self):
    """
    Kills the given processes.

    Returns the PID that was killed, or None if no PID was found (process not running).
    """
    pid = self.get_pid()
    if pid is None:
      assert 0, "No processes %s found" % self.cmd
    LOG.info('Killing: %s (PID: %d)'  % (' '.join(self.cmd), pid))
    exec_process("kill -9 %d" % pid)
    return pid
  def kill(self, signal=SIGKILL):
    """
    Kills the given processes.

    Returns the PID that was killed, or None if no PID was found (process not running).
    """
    pid = self.get_pid()
    if pid is None:
      assert 0, "No processes %s found" % self.cmd
    LOG.info('Killing: %s (PID: %d) with signal %s'  % (' '.join(self.cmd), pid, signal))
    exec_process("kill -%d %d" % (signal, pid))
    return pid
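The signal argument above is interpolated with %d, so it is expected to be a plain integer such as the constants from Python's standard signal module. A small illustration (the PID is made up; the numeric values assume Linux, where SIGKILL is 9 and SIGTERM is 15):

from signal import SIGKILL, SIGTERM

# On Linux these constants are the familiar integers, so they slot straight
# into the "kill -%d %d" command string built above.
assert "kill -%d %d" % (SIGKILL, 1234) == "kill -9 1234"
assert "kill -%d %d" % (SIGTERM, 1234) == "kill -15 1234"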
def change_cache_directive_repl_for_path(path, repl):
  """Drop the cache directive for a given path"""
  dirid = get_cache_directive_for_path(path)
  rc, stdout, stderr = exec_process(
    "hdfs cacheadmin -modifyDirective -id %s -replication %s" % (dirid, repl))
  assert rc == 0, \
      "Error modifying cache directive for path %s (%s, %s)" % (path, stdout, stderr)
  def test_hdfs_open_timeout(self, vector):
    """This verifies that hdfsOpenFile times out appropriately. It tests this by
       halting the NameNode, running a query that needs to do hdfsOpenFile,
       and verifying that it times out and throws an error."""

    # Find the NameNode's pid via pgrep. The assert below fails if pgrep does not
    # find a pid, so there is at least one match.
    rc, pgrep_output, stderr = exec_process("pgrep -f namenode.NameNode")
    assert rc == 0, \
        "Error finding NameNode pid\nstdout={0}\nstderr={1}".format(pgrep_output, stderr)
    # In our test environment, this should only match one pid
    assert(pgrep_output.count("\n") == 1)
    namenode_pid = pgrep_output.strip()

    # Run a query successfully. This fetches metadata from the NameNode,
    # and since this will be cached, a subsequent run will not ask the NameNode
    # for metadata. This means a subsequent execution will only talk to the NameNode
    # for file open.
    self.execute_query_expect_success(self.client,
        "select count(*) from functional.alltypes", vector=vector)

    # Stop the NameNode and execute the query again. Since the file handle cache is off,
    # the query will do hdfsOpenFile calls and talk to the NameNode. Since the NameNode
    # is stopped, those calls will hang, testing the timeout functionality.
    ex = None
    result = None
    try:
      # Stop the NameNode
      check_call(["kill", "-STOP", namenode_pid])
      start_time = time.time()
      result = self.execute_query("select count(*) from functional.alltypes",
          vector=vector)
      end_time = time.time()
    except Exception, e:
      ex = e
def is_path_fully_cached(path):
  """Returns true if all the bytes of the path are cached, false otherwise"""
  rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats -path %s" % path)
  assert rc == 0
  caching_stats = stdout.strip("\n").split("\n")[-1].split()
  # Compare BYTES_NEEDED and BYTES_CACHED, the output format is as follows
  # "ID POOL REPL EXPIRY PATH BYTES_NEEDED BYTES_CACHED FILES_NEEDED FILES_CACHED"
  return len(caching_stats) > 0 and caching_stats[5] == caching_stats[6]
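To make the field indexing concrete, here is how a hypothetical last line of the -listDirectives -stats output would be split (the values are invented for illustration):

# Columns: ID POOL REPL EXPIRY PATH BYTES_NEEDED BYTES_CACHED FILES_NEEDED FILES_CACHED
sample_line = " 25 testPool 1 never /test-warehouse/foo 1024 1024 1 1"
caching_stats = sample_line.split()
# caching_stats[5] is BYTES_NEEDED and caching_stats[6] is BYTES_CACHED,
# so this made-up path counts as fully cached.
assert caching_stats[5] == caching_stats[6]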
Example #7
def change_cache_directive_repl_for_path(path, repl):
    """Drop the cache directive for a given path"""
    dirid = get_cache_directive_for_path(path)
    rc, stdout, stderr = exec_process(
        "hdfs cacheadmin -modifyDirective -id %s -replication %s" %
        (dirid, repl))
    assert rc == 0, \
        "Error modifying cache directive for path %s (%s, %s)" % (path, stdout, stderr)
Example #9
  def test_hive_bulk_partition(self, vector):
    """Regression test for IMPALA-597. Verifies Impala is able to properly read
    tables that were altered using Hive's bulk partition statements that result
    in multiple partitions pointing to the same location.
    TODO: Once IMPALA-624 is resolved re-write this test using Impala instead of Hive.
    """
    self.client.execute("use %s" % self.TEST_DB)
    location = '/test-warehouse/hive_bulk_part'
    # Cleanup any existing data in the table directory.
    self.hdfs_client.delete_file_dir(location[1:], recursive=True)
    # Create the table
    self.client.execute("create table hive_bulk_part(i int) partitioned by(j int)"\
        "location '%s'" % location)

    # Point multiple partitions to the same location and use partition locations that
    # do not contain a key=value path.
    self.hdfs_client.make_dir(location[1:] + '/p')

    hive_cmd = "use %s; alter table hive_bulk_part add partition (j=1) location '%s/p'"\
        " partition(j=2) location '%s/p'" % (self.TEST_DB, location, location)
    print "Executing: %s" % hive_cmd
    rc, stdout, stderr = exec_process("hive -e \"%s\"" % hive_cmd)
    assert rc == 0, stdout + '\n' + stderr

    # Insert some data.
    hive_cmd = "insert into table %s.hive_bulk_part partition(j=1) select 1 from "\
               "functional.alltypes limit 1" % self.TEST_DB
    print "Executing: %s" % hive_cmd
    rc, stdout, stderr = exec_process("hive -e \"%s\"" % hive_cmd)
    assert rc == 0, stdout + '\n' + stderr

    # Reload the table metadata and ensure Impala detects this properly.
    self.client.execute("invalidate metadata hive_bulk_part")

    # The data will be read twice because each partition points to the same location.
    data = self.execute_scalar("select sum(i), sum(j) from hive_bulk_part")
    assert data.split('\t') == ['2', '3']

    self.client.execute("insert into hive_bulk_part partition(j) select 1, 1")
    self.client.execute("insert into hive_bulk_part partition(j) select 1, 2")
    data = self.execute_scalar("select sum(i), sum(j) from hive_bulk_part")
    try:
      assert data.split('\t') == ['6', '6']
    except AssertionError:
      pytest.xfail('IMPALA-624: Impala does not use a partition location for INSERT')
    def test_load_data(self, vector):
        key_tbl_dir = vector.get_value('key_tbl_dir')
        key_load_dir = vector.get_value('key_load_dir')

        if vector.get_value('partitioned'):
            src_file = "/test-warehouse/alltypes/year=2010/month=1/100101.txt"
            src_tbl_schema = "functional.alltypes"
        else:
            src_file = "/test-warehouse/tinytable/data.csv"
            src_tbl_schema = "functional.tinytable"

        if key_load_dir is not None:
            rc, stdout, stderr = exec_process(
                'hdfs crypto -createZone -keyName %s -path %s' %
                (key_load_dir, TMP_DIR))
            assert rc == 0, 'Error executing hdfs crypto: %s %s' % (stdout,
                                                                    stderr)

        # hdfs_client doesn't support copy
        rc, stdout, stderr = exec_process('hdfs dfs -cp %s %s' %
                                          (src_file, TMP_DIR))
        assert rc == 0, 'Error executing hdfs cp: %s %s' % (stdout, stderr)

        self.client.execute('create table tbl like %s' % (src_tbl_schema))

        if key_tbl_dir is not None:
            rc, stdout, stderr = exec_process(
                'hdfs crypto -createZone -keyName %s -path /test-warehouse/%s.db/tbl'
                % (key_tbl_dir, TEST_DB))
            assert rc == 0, 'Error executing hdfs crypto: %s %s' % (stdout,
                                                                    stderr)

        if vector.get_value('partitioned'):
            # insert a single value to create the partition spec
            self.client.execute('insert into tbl partition (year=2010, month=1) '\
                'values (0,true,0,0,0,0,0,0,NULL,NULL,NULL)')
            self.client.execute('load data inpath \'%s\' into table tbl '\
                'partition(year=2010, month=1)' % (TMP_DIR))
        else:
            self.client.execute('load data inpath \'%s\' into table tbl ' %
                                (TMP_DIR))
 def setup_method(self, method):
   self.__cleanup()
   self.client.execute('create database if not exists %s' % TEST_DB)
   self.client.execute('use %s' % TEST_DB)
   self.hdfs_client.make_dir(PYWEBHDFS_TMP_DIR)
   # A few tests depend on the .Trash directory being present. In case it doesn't
   # exist, we create a random text file and delete it so that HDFS recreates
   # the trash hierarchy.
   if not self.hdfs_client.exists("/user/{0}/.Trash/".format(getpass.getuser())):
     self.hdfs_client.create_file("test-warehouse/random",file_data="random")
     rc, stdout, stderr = exec_process("hadoop fs -rm /test-warehouse/random")
     assert rc == 0, 'Error re-creating trash: %s %s' % (stdout, stderr)
  def test_drop_table_encrypt(self):
    """Verifies if drop <table> purge works in a case where Trash directory and table
    directory in different encryption zones"""
    self.client.execute("create table {0}.t3(i int)".format(TEST_DB))

    # Clean up the trash directory to create an encrypted zone
    rc, stdout, stderr = exec_process(
            "hadoop fs -rmr /user/{0}/.Trash/*".format(getpass.getuser()))
    assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr)
    # Create table directory and trash directory in different encryption zones
    self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t3".format(TEST_DB))
    self.create_encryption_zone("testkey2", "/user/{0}/.Trash/".format(getpass.getuser()))
    self.client.execute("drop table {0}.t3 purge".format(TEST_DB))
    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t3".format(TEST_DB))
def run_query_capture_results(cmd, query, exit_on_error):
  """
  Runs the given query command and returns the execution result.

  Takes in a match function that is used to parse stderr/stdout to extract the results.
  """
  exec_result = HiveQueryResult(query)
  start_time = datetime.now()
  try:
    rc, stdout, stderr = exec_process(cmd)
  except Exception, e:
    LOG.error('Error while executing query command: %s' % e)
    exec_result.query_error = str(e)
    # TODO: Should probably save the start time and query string for failed queries.
    return exec_result
Example #16
def run_query_capture_results(cmd, query, exit_on_error):
    """
  Runs the given query command and returns the execution result.

  Takes in a match function that is used to parse stderr/stdout to extract the results.
  """
    exec_result = HiveQueryResult(query)
    start_time = datetime.now()
    try:
        rc, stdout, stderr = exec_process(cmd)
    except Exception, e:
        LOG.error('Error while executing query command: %s' % e)
        exec_result.query_error = str(e)
        # TODO: Should probably save the start time and query string for failed queries.
        return exec_result
  def test_load_data(self, vector):
    key_tbl_dir = vector.get_value('key_tbl_dir')
    key_load_dir = vector.get_value('key_load_dir')

    if vector.get_value('partitioned'):
      src_file = "/test-warehouse/alltypes/year=2010/month=1/100101.txt"
      src_tbl_schema = "functional.alltypes"
    else:
      src_file = "/test-warehouse/tinytable/data.csv"
      src_tbl_schema = "functional.tinytable"

    if key_load_dir is not None:
      rc, stdout, stderr = exec_process(
          'hdfs crypto -createZone -keyName %s -path %s' % (key_load_dir, TMP_DIR))
      assert rc == 0, 'Error executing hdfs crypto: %s %s' % (stdout, stderr)

    # hdfs_client doesn't support copy
    rc, stdout, stderr = exec_process('hdfs dfs -cp %s %s' % (src_file, TMP_DIR))
    assert rc == 0, 'Error executing hdfs cp: %s %s' % (stdout, stderr)

    self.client.execute('create table tbl like %s' % (src_tbl_schema))

    if key_tbl_dir is not None:
      rc, stdout, stderr = exec_process(
          'hdfs crypto -createZone -keyName %s -path /test-warehouse/%s.db/tbl' %
          (key_tbl_dir, TEST_DB))
      assert rc == 0, 'Error executing hdfs crypto: %s %s' % (stdout, stderr)

    if vector.get_value('partitioned'):
      # insert a single value to create the partition spec
      self.client.execute('insert into tbl partition (year=2010, month=1) '\
          'values (0,true,0,0,0,0,0,0,NULL,NULL,NULL)')
      self.client.execute('load data inpath \'%s\' into table tbl '\
          'partition(year=2010, month=1)' % (TMP_DIR))
    else:
      self.client.execute('load data inpath \'%s\' into table tbl ' % (TMP_DIR))
 def test_drop_partition_encrypt(self):
   """Verifies if alter <tbl> drop partition purge works in case
   where the Trash dir and partition dir are in different encryption
   zones. Check CDH-31350 for details"""
   self.client.execute("create table {0}.t1(i int) partitioned\
     by (j int)".format(TEST_DB))
   # Add three partitions (j=1), (j=2), (j=3) to table t1
   self.client.execute("alter table {0}.t1 add partition(j=1)".format(TEST_DB));
   self.client.execute("alter table {0}.t1 add partition(j=2)".format(TEST_DB));
   self.client.execute("alter table {0}.t1 add partition(j=3)".format(TEST_DB));
   # Clean up the trash directory to create an encrypted zone
   rc, stdout, stderr = exec_process(
           "hadoop fs -rmr /user/{0}/.Trash/*".format(getpass.getuser()))
   assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr)
   # Create the necessary encryption zones
   self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t1/j=1"\
           .format(TEST_DB))
   self.create_encryption_zone("testkey2", "/test-warehouse/{0}.db/t1/j=2"\
           .format(TEST_DB))
   self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t1/j=3"\
           .format(TEST_DB))
   self.create_encryption_zone("testkey2", "/user/{0}/.Trash/".format(\
           getpass.getuser()))
   # Load sample data into the partition directories
   self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=1/j1.txt"\
           .format(TEST_DB), file_data='j1')
   self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=2/j2.txt"\
           .format(TEST_DB), file_data='j2')
   self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=3/j3.txt"\
           .format(TEST_DB), file_data='j3')
   # Drop the partition (j=1) without purge and make sure partition directory still
   # exists. This behavior is expected due to the difference in encryption zones
   self.execute_query_expect_failure(self.client, "alter table {0}.t1 drop \
           partition(j=1)".format(TEST_DB));
   assert self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB))
   assert self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".format(TEST_DB))
   # Drop the partition j=2 (with purge) and make sure the partition directory is deleted
   self.client.execute("alter table {0}.t1 drop partition(j=2) purge".format(TEST_DB))
   assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2/j2.txt".format(TEST_DB))
   assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2".format(TEST_DB))
   # Drop the partition j=3 (with purge) and make sure the partition is deleted
   # This is the case where the trash directory and partition data directory
   # are in different encryption zones. Using purge should delete the partition
   # data permanently by skipping the trash
   self.client.execute("alter table {0}.t1 drop partition(j=3) purge".format(TEST_DB))
   assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=3/j3.txt".format(TEST_DB))
   assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=3".format(TEST_DB))
 def test_drop_partition_encrypt(self):
   """Verifies if alter <tbl> drop partition purge works in case
   where the Trash dir and partition dir are in different encryption
   zones. Check IMPALA-2310 for details"""
   self.client.execute("create table {0}.t1(i int) partitioned\
     by (j int)".format(TEST_DB))
   # Add three partitions (j=1), (j=2), (j=3) to table t1
   self.client.execute("alter table {0}.t1 add partition(j=1)".format(TEST_DB));
   self.client.execute("alter table {0}.t1 add partition(j=2)".format(TEST_DB));
   self.client.execute("alter table {0}.t1 add partition(j=3)".format(TEST_DB));
   # Clean up the trash directory to create an encrypted zone
   rc, stdout, stderr = exec_process(
           "hadoop fs -rmr /user/{0}/.Trash/*".format(getpass.getuser()))
   assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr)
   # Create the necessary encryption zones
   self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t1/j=1"\
           .format(TEST_DB))
   self.create_encryption_zone("testkey2", "/test-warehouse/{0}.db/t1/j=2"\
           .format(TEST_DB))
   self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t1/j=3"\
           .format(TEST_DB))
   self.create_encryption_zone("testkey2", "/user/{0}/.Trash/".format(\
           getpass.getuser()))
   # Load sample data into the partition directories
   self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=1/j1.txt"\
           .format(TEST_DB), file_data='j1')
   self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=2/j2.txt"\
           .format(TEST_DB), file_data='j2')
   self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=3/j3.txt"\
           .format(TEST_DB), file_data='j3')
   # Drop the partition (j=1) without purge and make sure partition directory still
   # exists. This behavior is expected due to the difference in encryption zones
   self.execute_query_expect_failure(self.client, "alter table {0}.t1 drop \
           partition(j=1)".format(TEST_DB));
   assert self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB))
   assert self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".format(TEST_DB))
   # Drop the partition j=2 (with purge) and make sure the partition directory is deleted
   self.client.execute("alter table {0}.t1 drop partition(j=2) purge".format(TEST_DB))
   assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2/j2.txt".format(TEST_DB))
   assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2".format(TEST_DB))
   # Drop the partition j=3 (with purge) and make sure the partition is deleted
   # This is the case where the trash directory and partition data directory
   # are in different encryption zones. Using purge should delete the partition
   # data permanently by skipping the trash
   self.client.execute("alter table {0}.t1 drop partition(j=3) purge".format(TEST_DB))
   assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=3/j3.txt".format(TEST_DB))
   assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=3".format(TEST_DB))
Example #20
 def get_num_cache_requests_util():
     rc, stdout, stderr = exec_process(
         "hdfs cacheadmin -listDirectives -stats")
     assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout,
                                                                 stderr)
      # remove blank lines from the output before counting
     lines = [line for line in stdout.split('\n') if line.strip()]
     count = None
     for line in lines:
         if line.startswith("Found "):
             # the line should say "Found <int> entries"
             # if we find this line we parse the number of entries
             # from this line.
             count = int(re.search(r'\d+', line).group())
             break
      # If the count is available, return it; otherwise fall back to the
      # total number of lines.
     if count is not None:
         return count
     else:
         return len(stdout.split('\n'))
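The loop above keys off cacheadmin's summary line of the form "Found <int> entries"; a quick check against a made-up line shows what the regex extracts:

import re

line = "Found 12 entries"  # hypothetical summary line from -listDirectives -stats
if line.startswith("Found "):
  count = int(re.search(r'\d+', line).group())
  assert count == 12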
Example #21
    def test_hdfs_open_timeout(self, vector):
        """This verifies that hdfsOpenFile times out appropriately. It tests this by
       halting the NameNode, running a query that needs to do hdfsOpenFile,
       and verifying that it times out and throws an error."""

        # Find the NameNode's pid via pgrep. The assert below fails if pgrep does not
        # find a pid, so there is at least one match.
        rc, pgrep_output, stderr = exec_process("pgrep -f namenode.NameNode")
        assert rc == 0, \
            "Error finding NameNode pid\nstdout={0}\nstderr={1}".format(pgrep_output, stderr)
        # In our test environment, this should only match one pid
        assert (pgrep_output.count("\n") == 1)
        namenode_pid = pgrep_output.strip()

        # Run a query successfully. This fetches metadata from the NameNode,
        # and since this will be cached, a subsequent run will not ask the NameNode
        # for metadata. This means a subsequent execution will only talk to the NameNode
        # for file open.
        self.execute_query_expect_success(
            self.client,
            "select count(*) from functional.alltypes",
            vector=vector)

        # Stop the NameNode and execute the query again. Since the file handle cache is off,
        # the query will do hdfsOpenFile calls and talk to the NameNode. Since the NameNode
        # is stopped, those calls will hang, testing the timeout functionality.
        ex = None
        result = None
        try:
            # Stop the NameNode
            check_call(["kill", "-STOP", namenode_pid])
            start_time = time.time()
            result = self.execute_query(
                "select count(*) from functional.alltypes", vector=vector)
            end_time = time.time()
        except Exception, e:
            ex = e
Example #22
 def get_num_cache_requests_util():
     rc, stdout, stderr = exec_process(
         "hdfs cacheadmin -listDirectives -stats")
     assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout,
                                                                 stderr)
     return len(stdout.split('\n'))
 def create_encryption_zone(self, key, path):
   """Creates an encryption zone using key 'key' on path 'path'"""
   rc, stdout, stderr = exec_process(
           "hdfs crypto -createZone -keyName %s -path %s" % (key, path))
   assert rc == 0, 'Error creating encryption zone: %s %s' % (stdout, stderr)
 def teardown_method(self, method):
   self.__cleanup()
   # Clean up trash directory so that further tests aren't affected
   rc, stdout, stderr = exec_process(
           "hadoop fs -rmr /user/{0}/.Trash/".format(getpass.getuser()))
   assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr)
Example #25
def drop_cache_directives_for_path(path):
  """Drop the cache directive for a given path"""
  rc, stdout, stderr = exec_process("hdfs cacheadmin -removeDirectives -path %s" % path)
  assert rc == 0, \
      "Error removing cache directive for path %s (%s, %s)" % (path, stdout, stderr)
def get_num_cache_requests():
    """Returns the number of outstanding cache requests"""
    rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
    assert rc == 0, "Error executing hdfs cacheadmin: %s %s" % (stdout, stderr)
    return len(stdout.split("\n"))
def get_cache_directive_for_path(path):
    rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -path %s" % path)
    assert rc == 0
    dirid = re.search("^\s+?(\d+)\s+?testPool\s+?.*?$", stdout, re.MULTILINE).group(1)
    return dirid
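The regular expression above pulls the directive ID out of the listing line that mentions testPool; against a hypothetical chunk of -listDirectives output (values invented for illustration) it behaves like this:

import re

stdout = "Found 1 entry\n  7 testPool 3 never /test-warehouse/foo\n"
dirid = re.search(r"^\s+?(\d+)\s+?testPool\s+?.*?$", stdout, re.MULTILINE).group(1)
assert dirid == "7"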
def drop_cache_directives_for_path(path):
    """Drop the cache directive for a given path"""
    rc, stdout, stderr = exec_process("hdfs cacheadmin -removeDirectives -path %s" % path)
    assert rc == 0, "Error removing cache directive for path %s (%s, %s)" % (path, stdout, stderr)
  def test_drop_partition_encrypt(self):
    """Verifies if alter <tbl> drop partition purge works in case
    where the Trash dir and partition dir are in different encryption
    zones. Check IMPALA-2310 for details"""
    self.client.execute("create table {0}.t1(i int) partitioned\
      by (j int)".format(TEST_DB))
    # Add three partitions (j=1), (j=2), (j=3) to table t1
    self.client.execute("alter table {0}.t1 add partition(j=1)".format(TEST_DB));
    self.client.execute("alter table {0}.t1 add partition(j=2)".format(TEST_DB));
    self.client.execute("alter table {0}.t1 add partition(j=3)".format(TEST_DB));
    # Clean up the trash directory to create an encrypted zone
    rc, stdout, stderr = exec_process(
            "hadoop fs -rm -r /user/{0}/.Trash/*".format(getpass.getuser()))
    assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr)
    # Create the necessary encryption zones
    self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t1/j=1"\
            .format(TEST_DB))
    self.create_encryption_zone("testkey2", "/test-warehouse/{0}.db/t1/j=2"\
            .format(TEST_DB))
    self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t1/j=3"\
            .format(TEST_DB))

    # HDFS 2.8+ behavior is to create individual trash per encryption zone;
    # don't create an encryption zone on .Trash in that case, otherwise
    # recursive trash is created.
    has_own_trash = self.hdfs_client.exists(
        "/test-warehouse/{0}.db/t1/j=1/.Trash".format(TEST_DB))
    if not has_own_trash:
      self.create_encryption_zone("testkey2", "/user/{0}/.Trash/".format(\
              getpass.getuser()))

    # Load sample data into the partition directories
    self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=1/j1.txt"\
            .format(TEST_DB), file_data='j1')
    self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=2/j2.txt"\
            .format(TEST_DB), file_data='j2')
    self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=3/j3.txt"\
            .format(TEST_DB), file_data='j3')

    # Drop the partition (j=1) without purge and make sure partition directory still
    # exists. This behavior is expected due to the difference in encryption zones
    # between the .Trash and the warehouse directory (prior to HDFS 2.8)
    if not has_own_trash:
      self.execute_query_expect_failure(self.client, "alter table {0}.t1 drop \
              partition(j=1)".format(TEST_DB));
      assert self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB))
      assert self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".format(TEST_DB))
    else:
      # With HDFS 2.8+ the query succeeds; dropping the partition removes its
      # directories and the data ends up in the user's trash
      self.client.execute("alter table {0}.t1 drop partition(j=1)".format(TEST_DB));
      assert self.hdfs_client.exists(
        "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1/j1.txt"\
        .format(getpass.getuser(), TEST_DB))
      assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB))
      assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".format(TEST_DB))

    # Drop the partition j=2 (with purge) and make sure the partition directory is deleted
    self.client.execute("alter table {0}.t1 drop partition(j=2) purge".format(TEST_DB))
    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2/j2.txt".format(TEST_DB))
    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2".format(TEST_DB))
    # Drop the partition j=3 (with purge) and make sure the partition is deleted
    # This is the case where the trash directory and partition data directory
    # are in different encryption zones. Using purge should delete the partition
    # data permanently by skipping the trash
    self.client.execute("alter table {0}.t1 drop partition(j=3) purge".format(TEST_DB))
    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=3/j3.txt".format(TEST_DB))
    assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=3".format(TEST_DB))
Example #31
    def test_drop_partition_encrypt(self):
        """Verifies if alter <tbl> drop partition purge works in case
    where the Trash dir and partition dir are in different encryption
    zones. Check IMPALA-2310 for details"""
        self.client.execute("create table {0}.t1(i int) partitioned\
      by (j int)".format(TEST_DB))
        # Add three partitions (j=1), (j=2), (j=3) to table t1
        self.client.execute(
            "alter table {0}.t1 add partition(j=1)".format(TEST_DB))
        self.client.execute(
            "alter table {0}.t1 add partition(j=2)".format(TEST_DB))
        self.client.execute(
            "alter table {0}.t1 add partition(j=3)".format(TEST_DB))
        # Clean up the trash directory to create an encrypted zone
        rc, stdout, stderr = exec_process(
            "hadoop fs -rm -r /user/{0}/.Trash/*".format(getpass.getuser()))
        assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr)
        # Create the necessary encryption zones
        self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t1/j=1"\
                .format(TEST_DB))
        self.create_encryption_zone("testkey2", "/test-warehouse/{0}.db/t1/j=2"\
                .format(TEST_DB))
        self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t1/j=3"\
                .format(TEST_DB))

        # HDFS 2.8+ behavior is to create individual trash per encryption zone;
        # don't create an encryption zone on .Trash in that case, otherwise
        # recursive trash is created.
        has_own_trash = self.hdfs_client.exists(
            "/test-warehouse/{0}.db/t1/j=1/.Trash".format(TEST_DB))
        if not has_own_trash:
            self.create_encryption_zone("testkey2", "/user/{0}/.Trash/".format(\
                    getpass.getuser()))

        # Load sample data into the partition directories
        self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=1/j1.txt"\
                .format(TEST_DB), file_data='j1')
        self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=2/j2.txt"\
                .format(TEST_DB), file_data='j2')
        self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=3/j3.txt"\
                .format(TEST_DB), file_data='j3')

        # Drop the partition (j=1) without purge and make sure partition directory still
        # exists. This behavior is expected due to the difference in encryption zones
        # between the .Trash and the warehouse directory (prior to HDFS 2.8)
        if not has_own_trash:
            self.execute_query_expect_failure(
                self.client, "alter table {0}.t1 drop \
              partition(j=1)".format(TEST_DB))
            assert self.hdfs_client.exists(
                "test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB))
            assert self.hdfs_client.exists(
                "test-warehouse/{0}.db/t1/j=1".format(TEST_DB))
        else:
            # With HDFS 2.8+ the query succeeds; dropping the partition removes its
            # directories and the data ends up in the user's trash
            self.client.execute(
                "alter table {0}.t1 drop partition(j=1)".format(TEST_DB))
            assert self.hdfs_client.exists(
              "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1/j1.txt"\
              .format(getpass.getuser(), TEST_DB))
            assert not self.hdfs_client.exists(
                "test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB))
            assert not self.hdfs_client.exists(
                "test-warehouse/{0}.db/t1/j=1".format(TEST_DB))

        # Drop the partition j=2 (with purge) and make sure the partition directory is deleted
        self.client.execute(
            "alter table {0}.t1 drop partition(j=2) purge".format(TEST_DB))
        assert not self.hdfs_client.exists(
            "test-warehouse/{0}.db/t1/j=2/j2.txt".format(TEST_DB))
        assert not self.hdfs_client.exists(
            "test-warehouse/{0}.db/t1/j=2".format(TEST_DB))
        # Drop the partition j=3 (with purge) and make sure the partition is deleted
        # This is the case where the trash directory and partition data directory
        # are in different encryption zones. Using purge should delete the partition
        # data permanently by skipping the trash
        self.client.execute(
            "alter table {0}.t1 drop partition(j=3) purge".format(TEST_DB))
        assert not self.hdfs_client.exists(
            "test-warehouse/{0}.db/t1/j=3/j3.txt".format(TEST_DB))
        assert not self.hdfs_client.exists(
            "test-warehouse/{0}.db/t1/j=3".format(TEST_DB))
 def get_num_cache_requests_util():
   rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
   assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout, stderr)
   return len(stdout.split('\n'))
Example #33
def get_cache_directive_for_path(path):
  rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -path %s" % path)
  assert rc == 0
  dirid = re.search(r'^\s+?(\d+)\s+?testPool\s+?.*?$', stdout, re.MULTILINE).group(1)
  return dirid
Example #35
def get_num_cache_requests():
  """Returns the number of outstanding cache requests"""
  rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
  assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout, stderr)
  return len(stdout.split('\n'))