def _test_query_origin_hive(threshold):
    """Run the benchmark query against the plain Hive tables.

    Executes HIVE_SQL_TEMPLATE once against the indexed ``log`` table and
    once against the unindexed ``log_noidx`` table, announcing each run.

    @param threshold value substituted into HIVE_SQL_TEMPLATE
    """
    runs = (
        ("Running on Log", 'log'),
        ("Running on Log without index.", 'log_noidx'),
    )
    for banner, table in runs:
        print(banner)
        # Hive must be invoked from the script directory (remote cd).
        with cd(SCRIPT_DIR):
            hadoop.run_hive('-e "%s"' % (HIVE_SQL_TEMPLATE % (table, threshold)))
def import_hive_data(csvdir='csv', **kwargs):
    """Import dataset into Hive as external table (param:csvdir='csv',create_index=1)

    Optional parameters:
    @param create_index=1  build compact indexes on the ``log`` table.
        Accepts 0/1 (coerced with int(), so ``create_index=0`` passed as a
        string from the fab command line correctly disables indexing).
    """
    # Fabric passes task arguments as strings; coerce explicitly so '0' is
    # falsy (matches the int(threshold) coercion in test_query_hive).
    do_create_index = int(kwargs.get('create_index', 1))
    csv_dir = os.path.join(SCRIPT_DIR, 'testdata/%s' % csvdir)
    # Recreate the HDFS staging directory: 'fs -test -d' fails when the
    # directory is absent, hence warn_only.
    with settings(warn_only=True):
        result = run("%(hadoop_bin)s/hadoop fs -test -d hdfs://%(head)s/csv"
                     % hadoop.env)
    if result.return_code == 0:
        run("%(hadoop_bin)s/hadoop fs -rmr hdfs://%(head)s/csv" % hadoop.env)
    run("%(hadoop_bin)s/hadoop fs -mkdir hdfs://%(head)s/csv" % hadoop.env)
    # Fan out the local CSV upload across all worker hosts.
    csv_prefixes = _get_hadoop_csv_prefix(csv_dir)
    execute(_hadoop_copy_from_local, csv_prefixes, hosts=env.workers)

    # Generate the Hive DDL: two external tables over the same HDFS data,
    # one ('log') that may get indexes, one ('log_noidx') that never does.
    init_sql = os.path.join(SCRIPT_DIR, 'testdata/hive.sql')
    with open(init_sql, 'w') as sqlfile:
        sqlfile.write("""use default;
DROP TABLE IF EXISTS log;
CREATE EXTERNAL TABLE log (time double, type string, event string,
value_name string, value double)
COMMENT "log table"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 'hdfs://%(head)s/csv';
DROP TABLE IF EXISTS log_noidx;
CREATE EXTERNAL TABLE log_noidx (time double, type string, event string,
value_name string, value double)
COMMENT "log table without index"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 'hdfs://%(head)s/csv';
SELECT * FROM log LIMIT 3;
SELECT * FROM log_noidx LIMIT 3;
""" % hadoop.env)
        if do_create_index:
            sqlfile.write("""
CREATE INDEX idx ON TABLE log(value_name)
AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD;
ALTER INDEX idx ON log REBUILD;
CREATE INDEX idx_value ON TABLE log(value)
AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD;
ALTER INDEX idx_value ON log REBUILD;
set hive.optimize.autoindex=true;
SHOW INDEX ON log;
""")
    with cd(SCRIPT_DIR):
        hadoop.run_hive("-f %s" % init_sql)
def test_query_hive(threshold=1000000, csv_dir='csv'):
    """Run query results on Hive (param:threshold=1000000)

    @param threshold value threshold for selecting files (coerced to int,
        since Fabric passes CLI arguments as strings)
    @param csv_dir=csv name of the testdata subdirectory holding the CSVs
    """
    threshold = int(threshold)
    #execute(_test_query_origin_hive, threshold)
    data_dir = '{}/testdata/{}'.format(SCRIPT_DIR, csv_dir)

    # Hive on VSFS: recreate the HDFS staging directory
    # ('fs -test -d' fails when the dir is absent, hence warn_only).
    with settings(warn_only=True):
        result = run("%(hadoop_bin)s/hadoop fs -test -d "
                     "hdfs://%(head)s/hivevsfs" % hadoop.env)
    if result.return_code == 0:
        run("%(hadoop_bin)s/hadoop fs -rmr hdfs://%(head)s/hivevsfs"
            % hadoop.env)
    run("%(hadoop_bin)s/hadoop fs -mkdir hdfs://%(head)s/hivevsfs"
        % hadoop.env)
    with cd(SCRIPT_DIR):
        run('rm -rf features && mkdir -p features')
    with settings(warn_only=True):
        execute(_extract_features, data_dir, threshold, hosts=env.workers)

    # Scan the extracted feature files for files whose Writer_5_runtime
    # exceeds the threshold.  NOTE: list the absolute directory — the
    # cd(SCRIPT_DIR) above only affects remote run(), not the local
    # process, so a bare 'features' would resolve against the caller's
    # CWD and break when invoked from elsewhere.
    features_dir = os.path.join(SCRIPT_DIR, 'features')
    desired_files = []
    for filename in os.listdir(features_dir):
        file_path = os.path.join(features_dir, filename)
        with open(file_path) as fobj:
            for line in fobj:
                fields = line.split()
                print(fields)
                # Skip blank/malformed lines instead of raising IndexError;
                # expected layout: <file> <feature_name> <value>.
                if len(fields) < 3:
                    continue
                if fields[1] == 'Writer_5_runtime' and \
                        float(fields[2]) > threshold:
                    desired_files.append(fields[0])
    print("Desired files: ", desired_files)

    # Upload only the selected files, then query them through an external
    # Hive table backed by the VSFS staging directory.
    for filename in desired_files:
        run("%s/hadoop fs -copyFromLocal %s/%s hdfs://%s/hivevsfs" %
            (hadoop.env['hadoop_bin'], data_dir, filename,
             hadoop.env['head']))
    sql_create_vsfs_table = """
DROP TABLE IF EXISTS vsfs;
CREATE EXTERNAL TABLE vsfs (time double, type string, event string,
value_name string, value double)
COMMENT 'Hive On Vsfs'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 'hdfs://%(head)s/hivevsfs';
""" % hadoop.env
    hadoop.run_hive('-e "%s"' % sql_create_vsfs_table)
    with cd(SCRIPT_DIR):
        hadoop.run_hive('-e "%s"' % (HIVE_SQL_TEMPLATE % ('vsfs', threshold)))