Example #1
0
    def test_def_level_encoding(self, vector, unique_database):
        """IMPALA-3376: Tests that parquet files are written to HDFS correctly.

        Generates a parquet table from tpch_parquet.lineitem, copies its '*.parq'
        files into a local temporary directory and runs the parquet-reader tool on
        each of them. The tool performs sanity checking, such as that the correct
        number of definition levels were encoded.

        The table is dropped and the temporary directory removed when the test
        finishes, whether it passed or not.
        """
        table_name = "test_hdfs_parquet_table_writer"
        qualified_table_name = "%s.%s" % (unique_database, table_name)
        self.execute_query(
            "create table %s stored as parquet as select l_linenumber from "
            "tpch_parquet.lineitem limit 180000" % qualified_table_name)

        tmp_dir = make_tmp_dir()
        try:
            hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq' %
                                    (unique_database, table_name))
            check_call(['hdfs', 'dfs', '-copyToLocal', hdfs_file, tmp_dir])

            parquet_reader = os.path.join(impalad_basedir, 'util/parquet-reader')
            for root, _, files in os.walk(tmp_dir):
                for f in files:
                    if not f.endswith('parq'):
                        continue
                    # Join with 'root' (the directory os.walk() is visiting), not
                    # 'tmp_dir', so a file found in a subdirectory resolves to a
                    # path that actually exists.
                    check_call(
                        [parquet_reader, '--file', os.path.join(root, str(f))])
        finally:
            try:
                self.execute_query("drop table %s" % qualified_table_name)
            finally:
                # Remove the local copy even if dropping the table failed.
                rmtree(tmp_dir)
Example #2
0
    def start_cluster_using_rules(self,
                                  redaction_rules,
                                  log_level=2,
                                  vmodule=""):
        '''Start an Impala cluster configured with the given redaction rules.

        Creates a new temporary directory (stored in self.tmp_dir and made
        world-accessible), creates the log, audit and profile directories, writes
        'redaction_rules' to self.rules_file and then starts the cluster with the
        audit, profile, redaction-rules and vmodule impalad flags. 'log_level' and
        self.log_dir are forwarded to _start_impala_cluster(). Finally a client is
        created and stored in self.client.'''
        self.tmp_dir = make_tmp_dir()
        os.chmod(self.tmp_dir, 0o777)
        LOG.info("tmp_dir is " + self.tmp_dir)
        # Each attribute is looked up right before its directory is created.
        for dir_attr in ("log_dir", "audit_dir", "profile_dir"):
            os.mkdir(getattr(self, dir_attr))

        # Write the redaction rules as set in @using_redaction_rules.
        with open(self.rules_file, 'w') as rules_out:
            rules_out.write(redaction_rules)

        impalad_args = """--impalad_args='-audit_event_log_dir=%s
                            -profile_log_dir=%s
                            -redaction_rules_file=%s
                            -vmodule=%s'""" % (
            self.audit_dir, self.profile_dir, self.rules_file, vmodule)
        self._start_impala_cluster(
            [impalad_args], log_dir=self.log_dir, log_level=log_level)
        self.client = self.create_impala_client()
  def test_def_level_encoding(self, vector, unique_database):
    """IMPALA-3376: Tests that parquet files are written to HDFS correctly.

    Generates a parquet table from tpch_parquet.lineitem (dropping any table of the
    same name first), copies its '*.parq' files into a local temporary directory and
    runs the parquet-reader tool on each of them. The tool performs sanity checking,
    such as that the correct number of definition levels were encoded.

    The table is dropped and the temporary directory removed when the test finishes,
    whether it passed or not.
    """
    table_name = "test_hdfs_parquet_table_writer"
    qualified_table_name = "%s.%s" % (unique_database, table_name)
    self.execute_query("drop table if exists %s" % qualified_table_name)
    self.execute_query("create table %s stored as parquet as select l_linenumber from "
        "tpch_parquet.lineitem limit 180000" % qualified_table_name)

    tmp_dir = make_tmp_dir()
    try:
      hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq'
          % (unique_database, table_name))
      check_call(['hdfs', 'dfs', '-copyToLocal', hdfs_file, tmp_dir])

      parquet_reader = os.path.join(impalad_basedir, 'util/parquet-reader')
      for root, _, files in os.walk(tmp_dir):
        for f in files:
          if not f.endswith('parq'):
            continue
          # Join with 'root' (the directory os.walk() is visiting), not 'tmp_dir', so
          # a file found in a subdirectory resolves to a path that actually exists.
          check_call([parquet_reader, '--file', os.path.join(root, str(f))])
    finally:
      try:
        self.execute_query("drop table %s" % qualified_table_name)
      finally:
        # Remove the local copy even if dropping the table failed.
        rmtree(tmp_dir)
Example #4
0
  def _get_row_group_stats_from_hdfs_folder(self, hdfs_path):
    """Returns a list of statistics for each row group in all parquet files in
    'hdfs_path'. The result is a two-dimensional list, containing stats by row group and
    column.

    'hdfs_path' is fetched with 'hdfs dfs -get' into a local temporary directory, every
    file found below that directory is passed to _get_row_group_stats_from_file(), and
    the directory is removed again before returning or propagating an error."""
    row_group_stats = []

    # Create the directory before entering 'try': if make_tmp_dir() itself fails there
    # is nothing to clean up, and referencing the still-unbound 'tmp_dir' in 'finally'
    # would replace the real error with an UnboundLocalError.
    tmp_dir = make_tmp_dir()
    try:
      check_call(['hdfs', 'dfs', '-get', hdfs_path, tmp_dir])

      for root, _, files in os.walk(tmp_dir):
        for f in files:
          parquet_file = os.path.join(root, str(f))
          row_group_stats.extend(self._get_row_group_stats_from_file(parquet_file))
    finally:
      rmtree(tmp_dir)

    return row_group_stats
Example #5
0
  def _get_row_group_stats_from_hdfs_folder(self, hdfs_path):
    """Returns a list of statistics for each row group in all parquet files in
    'hdfs_path'. The result is a two-dimensional list, containing stats by row group and
    column.

    'hdfs_path' is fetched with 'hdfs dfs -get' into a local temporary directory, every
    file found below that directory is passed to _get_row_group_stats_from_file(), and
    the directory is removed again before returning or propagating an error."""
    row_group_stats = []

    # Create the directory before entering 'try': if make_tmp_dir() itself fails there
    # is nothing to clean up, and referencing the still-unbound 'tmp_dir' in 'finally'
    # would replace the real error with an UnboundLocalError.
    tmp_dir = make_tmp_dir()
    try:
      check_call(['hdfs', 'dfs', '-get', hdfs_path, tmp_dir])

      for root, _, files in os.walk(tmp_dir):
        for f in files:
          parquet_file = os.path.join(root, str(f))
          row_group_stats.extend(self._get_row_group_stats_from_file(parquet_file))
    finally:
      rmtree(tmp_dir)

    return row_group_stats
  def start_cluster_using_rules(self, redaction_rules, log_level=2, vmodule=""):
    '''Start an Impala cluster configured with the given redaction rules.

    Creates a new temporary directory (stored in self.tmp_dir and made
    world-accessible), creates the log, audit and profile directories, writes
    'redaction_rules' to self.rules_file and then starts the cluster with the audit,
    profile, redaction-rules and vmodule impalad flags. 'log_level' and self.log_dir
    are forwarded to _start_impala_cluster(). Finally a client is created and stored
    in self.client.'''
    self.tmp_dir = make_tmp_dir()
    os.chmod(self.tmp_dir, 0o777)
    LOG.info("tmp_dir is " + self.tmp_dir)
    # Each attribute is looked up right before its directory is created.
    for dir_attr in ("log_dir", "audit_dir", "profile_dir"):
      os.mkdir(getattr(self, dir_attr))

    # Write the redaction rules as set in @using_redaction_rules.
    with open(self.rules_file, 'w') as rules_out:
      rules_out.write(redaction_rules)

    impalad_args = """--impalad_args='-audit_event_log_dir=%s
                            -profile_log_dir=%s
                            -redaction_rules_file=%s
                            -vmodule=%s'""" % (
        self.audit_dir, self.profile_dir, self.rules_file, vmodule)
    self._start_impala_cluster(
        [impalad_args], log_dir=self.log_dir, log_level=log_level)
    self.client = self.create_impala_client()