Example No. 1
0
def _test_query_origin_hive(threshold):
    """Run the benchmark query on both the indexed and the non-indexed log table.

    Executes HIVE_SQL_TEMPLATE (filled with the table name and *threshold*)
    via ``hive -e`` for the ``log`` and ``log_noidx`` tables in turn.
    """
    for table, label in (('log', 'Log'), ('log_noidx', 'Log without index.')):
        print("Running on %s" % label)
        with cd(SCRIPT_DIR):
            hadoop.run_hive('-e "%s"' % (HIVE_SQL_TEMPLATE % (table, threshold)))
Example No. 2
0
def import_hive_data(csvdir='csv', create_index=1, **kwargs):
    """Import dataset into Hive as external table (param:csvdir='csv',create_index=1)

    Stages the local CSV test data under hdfs://<head>/csv, then generates
    and runs a Hive DDL script that (re)creates the external tables ``log``
    and ``log_noidx`` on top of it.

    Optional parameters:
    @param create_index=1  truthy (default): additionally build compact
        indexes on the ``log`` table; pass create_index=0 to skip.
    """
    # Fix: the old docstring advertised create_index=False, but the effective
    # default has always been 1 (indexes built).  The flag is now an explicit
    # keyword argument; **kwargs is retained so existing callers passing
    # extra keywords keep working unchanged.
    do_create_index = create_index
    csv_dir = os.path.join(SCRIPT_DIR, 'testdata/%s' % csvdir)

    # Recreate the hdfs://<head>/csv staging directory from scratch
    # (warn_only: the -test probe legitimately fails when the dir is absent).
    with settings(warn_only=True):
        result = run("%(hadoop_bin)s/hadoop fs -test -d hdfs://%(head)s/csv" %
                     hadoop.env)
        if result.return_code == 0:
            run("%(hadoop_bin)s/hadoop fs -rmr hdfs://%(head)s/csv" %
                hadoop.env)
        run("%(hadoop_bin)s/hadoop fs -mkdir hdfs://%(head)s/csv" %
            hadoop.env)

    # Each worker host copies its shard of the CSV files into HDFS.
    csv_prefixes = _get_hadoop_csv_prefix(csv_dir)
    execute(_hadoop_copy_from_local, csv_prefixes, hosts=env.workers)

    # Generate the Hive DDL script, then run it with "hive -f".
    init_sql = os.path.join(SCRIPT_DIR, 'testdata/hive.sql')
    with open(init_sql, 'w') as sqlfile:
        sqlfile.write("""use default;
DROP TABLE IF EXISTS log;
CREATE EXTERNAL TABLE log (time double, type string, event string,
value_name string, value double )
COMMENT "log table"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 'hdfs://%(head)s/csv';

DROP TABLE IF EXISTS log_noidx;
CREATE EXTERNAL TABLE log_noidx (time double, type string, event string,
value_name string, value double )
COMMENT "log table without index"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 'hdfs://%(head)s/csv';

SELECT * FROM log LIMIT 3;
SELECT * FROM log_noidx LIMIT 3;
""" % hadoop.env)
        if do_create_index:
            # Compact indexes on value_name and value; DEFERRED REBUILD
            # requires the explicit ALTER INDEX ... REBUILD that follows.
            sqlfile.write("""
CREATE INDEX idx ON TABLE log(value_name)
AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD;
ALTER INDEX idx ON log REBUILD;
CREATE INDEX idx_value ON TABLE log(value)
AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD;
ALTER INDEX idx_value ON log REBUILD;
set hive.optimize.autoindex=true;
SHOW INDEX ON log;
""")

    with cd(SCRIPT_DIR):
        hadoop.run_hive("-f %s" % init_sql)
Example No. 3
0
def test_query_hive(threshold=1000000, csv_dir='csv'):
    """Run query results on Hive (param:threshold=1000000)

    Extracts per-file features on the workers, picks the data files whose
    ``Writer_5_runtime`` feature exceeds *threshold*, stages them under
    hdfs://<head>/hivevsfs, and runs the benchmark query against an external
    ``vsfs`` table built on that directory.
    """
    threshold = int(threshold)

    #execute(_test_query_origin_hive, threshold)

    data_dir = '{}/testdata/{}'.format(SCRIPT_DIR, csv_dir)
    # Hive on VSFS: recreate the hdfs://<head>/hivevsfs staging directory
    # (warn_only: the -test probe fails when the directory does not exist).
    with settings(warn_only=True):
        result = run("%(hadoop_bin)s/hadoop fs -test -d "
                     "hdfs://%(head)s/hivevsfs" % hadoop.env)
        if result.return_code == 0:
            run("%(hadoop_bin)s/hadoop fs -rmr hdfs://%(head)s/hivevsfs" %
                hadoop.env)
    run("%(hadoop_bin)s/hadoop fs -mkdir hdfs://%(head)s/hivevsfs" %
        hadoop.env)

    # Reset the feature output directory, then let every worker extract
    # features in parallel (warn_only: individual extractions may fail).
    with cd(SCRIPT_DIR):
        run('rm -rf features && mkdir -p features')
    with settings(warn_only=True):
        execute(_extract_features, data_dir, threshold, hosts=env.workers)

    # Scan the feature files for entries above the threshold.  Each line is
    # expected as: <filename> <feature_name> <value>.
    # Bug fix: listdir previously used the relative path 'features', which
    # only worked when the CWD happened to be SCRIPT_DIR — use the same
    # absolute directory the per-file paths are built from.  Short or blank
    # lines are skipped instead of raising IndexError.
    features_dir = os.path.join(SCRIPT_DIR, 'features')
    desired_files = []
    for filename in os.listdir(features_dir):
        file_path = os.path.join(features_dir, filename)
        with open(file_path) as fobj:
            for line in fobj:
                fields = line.split()
                print(fields)
                if len(fields) >= 3 and fields[1] == 'Writer_5_runtime' \
                        and float(fields[2]) > threshold:
                    desired_files.append(fields[0])

    print("Desired files: ", desired_files)
    # Stage only the selected files into the hivevsfs directory.
    for filename in desired_files:
        run("%s/hadoop fs -copyFromLocal %s/%s hdfs://%s/hivevsfs" %
            (hadoop.env['hadoop_bin'], data_dir, filename, hadoop.env['head']))

    sql_create_vsfs_table = """
DROP TABLE IF EXISTS vsfs;
CREATE EXTERNAL TABLE vsfs (time double, type string, event string,
value_name string, value double)
COMMENT 'Hive On Vsfs'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 'hdfs://%(head)s/hivevsfs';
""" % hadoop.env
    hadoop.run_hive('-e "%s"' % sql_create_vsfs_table)

    with cd(SCRIPT_DIR):
        hadoop.run_hive('-e "%s"' % (HIVE_SQL_TEMPLATE % ('vsfs', threshold)))