def test_def_level_encoding(self, vector, unique_database):
    """IMPALA-3376: Check that parquet files are written to HDFS correctly.

    Creates a parquet table with a CTAS statement, copies its '*.parq' data files
    into a local temporary directory and runs the parquet-reader tool on each of
    them. The tool performs sanity checking, such as that the correct number of
    definition levels were encoded. A failing command is expected to surface
    through check_call() (presumably subprocess.check_call -- confirm against the
    file's imports).

    The temporary directory is removed and the table is dropped even when one of
    the steps fails.
    """
    table_name = "test_hdfs_parquet_table_writer"
    qualified_table_name = "%s.%s" % (unique_database, table_name)
    self.execute_query(
        "create table %s stored as parquet as select l_linenumber from "
        "tpch_parquet.lineitem limit 180000" % qualified_table_name)
    try:
        # Created inside the outer try so that the table is still dropped if the
        # temporary directory cannot be created.
        tmp_dir = make_tmp_dir()
        try:
            hdfs_file = get_fs_path(
                '/test-warehouse/%s.db/%s/*.parq' % (unique_database, table_name))
            check_call(['hdfs', 'dfs', '-copyToLocal', hdfs_file, tmp_dir])
            parquet_reader = os.path.join(impalad_basedir, 'util/parquet-reader')
            for root, _, files in os.walk(tmp_dir):
                for f in files:
                    if not f.endswith('parq'):
                        continue
                    # Join with 'root' (the directory os.walk is currently
                    # visiting) so the path is right at any nesting depth.
                    check_call(
                        [parquet_reader, '--file', os.path.join(root, str(f))])
        finally:
            # Runs regardless of the outcome above; the outer finally still drops
            # the table if this removal itself fails.
            rmtree(tmp_dir)
    finally:
        self.execute_query("drop table %s" % qualified_table_name)
def start_cluster_using_rules(self, redaction_rules, log_level=2, vmodule=""):
    """Start Impala with a custom log dir and redaction rules.

    A fresh temporary directory is created (mode 0o777) and stored in
    self.tmp_dir, the log, audit and profile directories are created, and
    'redaction_rules' is written to self.rules_file. The cluster is then started
    with impalad flags pointing at those locations, and a new client is stored in
    self.client.

    redaction_rules: text written verbatim to the rules file.
    log_level: forwarded to _start_impala_cluster().
    vmodule: value substituted into the impalad -vmodule flag.
    """
    self.tmp_dir = make_tmp_dir()
    os.chmod(self.tmp_dir, 0o777)
    LOG.info("tmp_dir is " + self.tmp_dir)

    for directory in (self.log_dir, self.audit_dir, self.profile_dir):
        os.mkdir(directory)

    # Write the redaction rules as set in @using_redaction_rules.
    with open(self.rules_file, 'w') as rules_out:
        rules_out.write(redaction_rules)

    impalad_args = (
        "--impalad_args='-audit_event_log_dir=%s -profile_log_dir=%s "
        "-redaction_rules_file=%s -vmodule=%s'"
        % (self.audit_dir, self.profile_dir, self.rules_file, vmodule))
    self._start_impala_cluster(
        [impalad_args], log_dir=self.log_dir, log_level=log_level)
    self.client = self.create_impala_client()
def test_def_level_encoding(self, vector, unique_database):
    """IMPALA-3376: Check that parquet files are written to HDFS correctly.

    Drops any leftover table of the same name, creates a parquet table with a
    CTAS statement, copies its '*.parq' data files into a local temporary
    directory and runs the parquet-reader tool on each of them. The tool performs
    sanity checking, such as that the correct number of definition levels were
    encoded. A failing command is expected to surface through check_call()
    (presumably subprocess.check_call -- confirm against the file's imports).

    The temporary directory is removed and the table is dropped even when one of
    the steps fails.
    """
    table_name = "test_hdfs_parquet_table_writer"
    qualified_table_name = "%s.%s" % (unique_database, table_name)
    self.execute_query("drop table if exists %s" % qualified_table_name)
    self.execute_query(
        "create table %s stored as parquet as select l_linenumber from "
        "tpch_parquet.lineitem limit 180000" % qualified_table_name)
    try:
        # Created inside the outer try so that the table is still dropped if the
        # temporary directory cannot be created.
        tmp_dir = make_tmp_dir()
        try:
            hdfs_file = get_fs_path(
                '/test-warehouse/%s.db/%s/*.parq' % (unique_database, table_name))
            check_call(['hdfs', 'dfs', '-copyToLocal', hdfs_file, tmp_dir])
            parquet_reader = os.path.join(impalad_basedir, 'util/parquet-reader')
            for root, _, files in os.walk(tmp_dir):
                for f in files:
                    if not f.endswith('parq'):
                        continue
                    # Join with 'root' (the directory os.walk is currently
                    # visiting) so the path is right at any nesting depth.
                    check_call(
                        [parquet_reader, '--file', os.path.join(root, str(f))])
        finally:
            # Runs regardless of the outcome above; the outer finally still drops
            # the table if this removal itself fails.
            rmtree(tmp_dir)
    finally:
        self.execute_query("drop table %s" % qualified_table_name)
def _get_row_group_stats_from_hdfs_folder(self, hdfs_path):
    """Return the row group statistics of all parquet files in 'hdfs_path'.

    The contents of 'hdfs_path' are fetched with 'hdfs dfs -get' into a local
    temporary directory, and every file found below that directory is passed to
    _get_row_group_stats_from_file(). The temporary directory is removed before
    returning, whether or not an error occurred.

    The result is a two-dimensional list, containing stats by row group and
    column.
    """
    row_group_stats = []
    # Create the directory before entering the try block: if creation fails
    # there is nothing to clean up, and the finally clause must not reference an
    # unbound 'tmp_dir' (which would replace the real error with an
    # UnboundLocalError).
    tmp_dir = make_tmp_dir()
    try:
        check_call(['hdfs', 'dfs', '-get', hdfs_path, tmp_dir])
        for root, _, files in os.walk(tmp_dir):
            for f in files:
                parquet_file = os.path.join(root, str(f))
                row_group_stats.extend(
                    self._get_row_group_stats_from_file(parquet_file))
    finally:
        rmtree(tmp_dir)
    return row_group_stats
def start_cluster_using_rules(self, redaction_rules, log_level=2, vmodule=""):
    """Start Impala with a custom log dir and redaction rules.

    Steps, in order: create a temporary directory (mode 0o777) and remember it
    as self.tmp_dir; create the log, audit and profile directories; write
    'redaction_rules' to self.rules_file; start the cluster with impalad flags
    referencing those paths plus 'vmodule'; create a client and keep it in
    self.client. 'log_level' is passed through to _start_impala_cluster().
    """
    self.tmp_dir = make_tmp_dir()
    os.chmod(self.tmp_dir, 0o777)
    tmp_dir_message = "tmp_dir is " + self.tmp_dir
    LOG.info(tmp_dir_message)

    os.mkdir(self.log_dir)
    os.mkdir(self.audit_dir)
    os.mkdir(self.profile_dir)

    # Write the redaction rules as set in @using_redaction_rules.
    with open(self.rules_file, 'w') as rules_handle:
        rules_handle.write(redaction_rules)

    flag_values = (self.audit_dir, self.profile_dir, self.rules_file, vmodule)
    cluster_options = [
        "--impalad_args='-audit_event_log_dir=%s -profile_log_dir=%s "
        "-redaction_rules_file=%s -vmodule=%s'" % flag_values
    ]
    self._start_impala_cluster(
        cluster_options,
        log_dir=self.log_dir,
        log_level=log_level)
    self.client = self.create_impala_client()