def setupAcidDataset(testsuite, LOCAL_DIR):
    """Download the TPCH ACID dataset and load the ACID tables into Hive.

    :param testsuite: Which table-setup DDL to run; must be 'acid' or 'unbucketed'.
    :param LOCAL_DIR: Local directory containing the ddl/ folder; also the
                      download/extract target for the data tarball.
    """
    if testsuite == 'acid':
        ddl_location = os.path.join(LOCAL_DIR, "ddl", "acid-tpch-tablesetup.sql")
    elif testsuite == 'unbucketed':
        ddl_location = os.path.join(LOCAL_DIR, "ddl", "acid-tpch-unbucketed-tablesetup.sql")
    else:
        # BUG FIX: was `assert 1 == 0` with the typo "unbuckted" in the message.
        assert False, "The testsuite passed in not correct. Please use value 'acid' or 'unbucketed'"

    # change timezone on test machines
    Machine.resetTimeZoneOnCluster()

    # Download TPCH acid data only when the tarball is not already cached locally
    tpch_newdata_dir = os.path.join(LOCAL_DIR, "tpch_newdata_5G")
    TPCH_STAGE_TGZ = os.path.join(LOCAL_DIR, "tpch_newdata_5G.tgz")
    if not os.path.isfile(TPCH_STAGE_TGZ):
        assert util.downloadUrl(Config.get('hive', 'TPCH_NEWDATA_5G_DNLD_URL'), TPCH_STAGE_TGZ)
    Machine.tarExtractAll(TPCH_STAGE_TGZ, LOCAL_DIR)

    # Load the acid tables in Hive: stage lineitem files under /tmp/lineitem_acid,
    # open up permissions, then run the suite's DDL through beeline.
    HADOOPQA_USER = Config.get("hadoop", 'HADOOPQA_USER')
    HDFS.createDirectory("/tmp/lineitem_acid", user=HADOOPQA_USER, perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(tpch_newdata_dir, "lineitem*"), "/tmp/lineitem_acid", HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/lineitem_acid", recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        ddl_location, hivevar={'HDFS_LOCATION': '/tmp'}, logoutput=True, queryIsFile=True
    )
    assert exit_code == 0, "Failed to populate the TPCH acid data in Hive"
def setupHS2ConcurrencyDataset():
    """Stage the HS2 concurrency dataset on HDFS and build the text/ORC test tables."""
    logger.info("Setup test data")
    data_dir = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "hs2concur-test-data")
    data_tgz = os.path.join(Config.getEnv('WORKSPACE'), "hs2concur-test-data.tgz")
    # Only download the tarball when it is not already in the workspace.
    if not os.path.isfile(data_tgz):
        assert util.downloadUrl(Config.get('hive', 'HS2CONCURR_TEST_DATA'), data_tgz)
    Machine.tarExtractAll(data_tgz, data_dir)
    # load data into HDFS
    hdfs_user = Config.get("hadoop", 'HDFS_USER')
    HDFS.createDirectory("/tmp/hs2data", user=hdfs_user, perm='777', force=True)
    HDFS.createDirectory("/tmp/hs2data/student", perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(data_dir, 'studenttab10k'), "/tmp/hs2data/student")
    HDFS.createDirectory("/tmp/hs2data/voter", perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(data_dir, 'votertab10k'), "/tmp/hs2data/voter")
    # External text tables point at the staged files; the transactional ORC
    # tables are then populated from them.
    query = """drop table if exists student_txt;
create external table student_txt (name string, age int, gpa double) row format delimited fields terminated by '\\t' stored as textfile location '/tmp/hs2data/student';
drop table if exists voter_txt;
create external table voter_txt (name string, age int, registration string, contributions float) row format delimited fields terminated by '\\t' stored as textfile location '/tmp/hs2data/voter';
drop table if exists student;
create table student (name string, age int, gpa double) CLUSTERED BY (name) INTO 20 BUCKETS STORED AS ORC TBLPROPERTIES('transactional'='true');
drop table if exists voter;
create table voter (name string, age int, registration string, contributions float) CLUSTERED BY (name) INTO 20 BUCKETS STORED AS ORC TBLPROPERTIES('transactional'='true');
Insert into table student select * from student_txt;
Insert into table voter select * from voter_txt;"""
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(query, readFromFile=True, logoutput=True)
    assert exit_code == 0, "Test data creation failed"
def setupMondrianDataset():
    """Download the Mondrian foodmart dataset and create its database/tables in Hive."""
    DATABASE_NAME = 'foodmart'
    LOCAL_DATA_DIR = os.path.join(Config.getEnv('ARTIFACTS_DIR'), DATABASE_NAME)
    FOODMART_DDL = os.path.join(LOCAL_DATA_DIR, "foodmart.ddl")
    HADOOPQA_USER = Config.get("hadoop", 'HADOOPQA_USER')
    logger.info("Setup Mondrian dataset")
    # Download + extract only if the dataset directory is missing.
    if not os.path.exists(LOCAL_DATA_DIR):
        tarball = LOCAL_DATA_DIR + ".tgz"
        assert util.downloadUrl(Config.get('hive', 'MONDRIAN_DATASET'), tarball)
        Machine.tarExtractAll(tarball, Config.getEnv('ARTIFACTS_DIR'))
        assert os.path.isdir(LOCAL_DATA_DIR)
    logger.info("create foodmart database and tables")
    HDFS.createDirectory("/tmp/mondrian", HADOOPQA_USER, perm='777', force=True)
    HDFS.copyFromLocal(LOCAL_DATA_DIR, "/tmp/mondrian", HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/mondrian", recursive=True)
    # The DDL script consumes DB and LOCATION via hivevar substitution.
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        FOODMART_DDL,
        hivevar={
            'DB': 'foodmart',
            'LOCATION': '/tmp/mondrian/foodmart'
        },
        logoutput=True,
        queryIsFile=True
    )
    assert exit_code == 0, "Unable to deploy foodmart dataset"
def background_job_setup(cls, runSmokeTestSetup=True, config=None):
    '''
    Create 5 input datasets for TestOrderedWordCount
    :param runSmokeTestSetup: Runs smoke test setup if set to true
    '''
    logger.info("*** Start background job setup for Tez ***")
    # Start from a clean local work dir.
    Machine.rm(user=HADOOPQA_USER, host=None, filepath=LOCAL_WORK_DIR, isdir=True)
    os.mkdir(LOCAL_WORK_DIR)
    # BUG FIX: the docstring promises 5 datasets but range(0, 4, 1) only produced
    # 4 (Input0..Input3); range(5) yields Input0..Input4 as documented.
    for i in range(5):
        inputDirName = "HDFS_INPUT%d" % i
        inputDirPath = os.path.join(LOCAL_WORK_DIR, inputDirName)
        HadoopJobHelper.runCustomWordWriter(LOCAL_WORK_DIR, inputDirPath, 10, 400, 10000)
        hdfsInputDir = "/user/%s/Input%d" % (HADOOPQA_USER, i)
        hdfsOutputDir = "/user/%s/output%d" % (HADOOPQA_USER, i)
        # In case already present, delete the input directory
        HDFS.deleteDirectory(hdfsInputDir)
        HDFS.createDirectory(hdfsInputDir)
        HDFS.deleteDirectory(hdfsOutputDir)
        HDFS.copyFromLocal(inputDirPath, hdfsInputDir)
        cls._hdfsInputList.append(hdfsInputDir + "/" + inputDirName)
        cls._hdfsOutputList.append(hdfsOutputDir)
        logger.info("Created data for input %d", i)
    logger.info("*** End background job setup for Tez ***")
def run(self):
    """
    Move files to HDFS Input Dir after each interval period for n times.

    Each iteration writes a small text file with a random 5-letter name,
    copies it to a staging /tmp dir on the right namespace, then moves it
    into self.hdfs_input_dir. Copy/move is retried up to 3 times.
    """
    for _ in range(0, self.times):
        text = "hello world \n Testing HDFS Word count Spark application"
        random_name = ''.join(random.choice(string.lowercase) for i in range(5))
        filename = os.path.join(Config.getEnv('ARTIFACTS_DIR'), random_name)
        util.writeToFile(text, filename, isAppend=False)
        max_retry = 3
        # BUG FIX: the retry counter used to shadow the outer loop variable
        # `count`, and the "give up" branch was unreachable (the counter hit
        # max_retry inside the while-condition and the loop just exited).
        attempt = 0
        while attempt < max_retry:
            try:
                # ns2 paths must be staged on the ns2 namespace explicitly.
                if "hdfs://ns2" in self.hdfs_input_dir:
                    cp_status = HDFS.copyFromLocal(filename, "hdfs://ns2/tmp", enableDebug=True)
                else:
                    cp_status = HDFS.copyFromLocal(filename, "/tmp", enableDebug=True)
                assert cp_status[0] == 0, "Failed to copy file to HDFS 'tmp'"
                logger.info("copyFromLocal command finished for %s" % filename)
                if "hdfs://ns2" in self.hdfs_input_dir:
                    mv_status = HDFS.mv(None, "hdfs://ns2/tmp/" + random_name, self.hdfs_input_dir, config=None)
                else:
                    mv_status = HDFS.mv(None, "/tmp/" + random_name, self.hdfs_input_dir, config=None)
                assert mv_status[0] == 0, "Failed to move file from 'tmp' to test directory"
            except Exception:
                attempt += 1
                if attempt < max_retry:
                    logger.info(
                        "File copy into HDFS test directory failed after %s attempts, retrying after 120s sleep interval" % attempt)
                    time.sleep(120)
                else:
                    # All retries exhausted; log and carry on with the next file.
                    logger.error("Failed to copy file into HDFS test directory, expect failures in HDFSWordCount")
            else:
                break
        logger.info("%s moved to %s" % (filename, self.hdfs_input_dir))
        logger.info("sleeping for %s seconds" % self.interval)
        time.sleep(self.interval)
def checkClasspathVersion(cls, Version_Num, config=None):
    """Run a streaming verifier job that checks the MR classpath carries Version_Num.

    Writes the expected version string to a template file, ships it to HDFS,
    runs the versionVerify mapper/reducer as a streaming job, and asserts the
    job output contains 'True'. Cleans up the verify output dir afterwards.
    """
    local_test_dir = os.path.join(Config.getEnv("WORKSPACE"), "tests", "rolling_upgrade", "yarn")
    multi_version_app_dir = os.path.join(local_test_dir, "data")
    mapper = "data/versionVerifyMapper.py"
    reducer = "data/versionVerifyReducer.py"
    verify_test_file = os.path.join(multi_version_app_dir, "test.txt")
    # Forward the cluster's current MR framework settings into the job env.
    env = {
        "mapreduce.application.framework.path": MAPRED.getConfigValue("mapreduce.application.framework.path", None),
        "mapreduce.application.classpath": MAPRED.getConfigValue("mapreduce.application.classpath", None)
    }
    verifyInput = cls._hdfs_input + "/verify"
    HDFS.createDirectory(verifyInput, None, "777", False)
    # Template file holds the version string the mapper/reducer compare against.
    with open(verify_test_file, 'w') as template:
        template.write(Version_Num)
    HDFS.copyFromLocal(verify_test_file, verifyInput, user=Config.get('hadoop', 'HADOOPQA_USER'))
    # Submit the special streaming job
    shortStreamingId = HadoopJobHelper.runStreamJob(
        mapper,
        reducer,
        verifyInput,
        cls._hdfs_output_verify,
        files=multi_version_app_dir,
        config=config,
        extraJobArg=cls._jobArgs,
        env=env,
        proposedJobName=cls._shortStreamingName)
    MAPRED.waitForJobDoneOrTimeout(shortStreamingId, timeoutInSec=180)
    # Check result content
    retVal, checkContent = HDFS.cat(cls._hdfs_output_verify + '/part-00000')
    logger.info("CHECK CLASSPATH VERSION OUTPUT")
    logger.info(retVal)
    logger.info(checkContent)
    ruAssert("YARN", retVal == 0)
    ruAssert("YARN", 'True' in checkContent,
             "[VersionVerify] Stream job returns false: " + checkContent)
    HDFS.deleteDirectory(cls._hdfs_output_verify, user=Config.get('hadoop', 'HADOOPQA_USER'))
def background_job_setup(cls, runSmokeTestSetup=True, config=None):
    '''
    Upload Data to HDFS before Upgrade starts
    Creates /user/hrt_qa/test_rollingupgrade dir on HDFS
    Upload 20 files to /user/hrt_qa/test_rollingupgrade
    '''
    if not cls._base_hdfs_dir:
        cls._base_hdfs_dir = '/user/%s/test_rollingupgrade' % Config.get('hadoop', 'HADOOPQA_USER')
    exit_code, stdout = HDFS.createDirectory(cls._base_hdfs_dir, force=True)
    ruAssert("HDFS", exit_code == 0, '[BGJobSetup] could not create dir on hdfs.')
    # Generate word files locally and push them up to the base dir.
    local_work_dir = os.path.join(Config.getEnv('ARTIFACTS_DIR'), 'HDFS_RU_TEST')
    temp_data_dir = os.path.join(local_work_dir, "Temp_data")
    HadoopJobHelper.runCustomWordWriter(local_work_dir, temp_data_dir, 20, 40, 1000)
    HDFS.copyFromLocal(os.path.join(temp_data_dir, "*"), cls._base_hdfs_dir)
    # set up for loadGenerator
    cls._lgTestDataDir = cls._base_hdfs_dir + '/testData'
    cls._lgTestOutputDir = cls._base_hdfs_dir + '/lg_job'
    cls._lgStructureDir = Machine.getTempDir() + "/structure"
    # test dir setup
    HDFS.deleteDirectory(cls._lgTestDataDir)
    HDFS.deleteDirectory(cls._lgTestOutputDir)
    exit_code, stdout = Machine.runas(
        Machine.getAdminUser(), "rm -rf " + cls._lgStructureDir,
        None, None, None, "True", Machine.getAdminPasswd())
    stdout = Machine.runas(None, "mkdir " + cls._lgStructureDir, None, None, None, "True", None)
    Machine.chmod("777", cls._lgStructureDir, "True", Machine.getAdminUser(), None, Machine.getAdminPasswd())
    HADOOP_TEST_JAR = cls.get_hadoop_test_jar()
    TEST_USER = Config.get('hadoop', 'HADOOPQA_USER')
    # structure generator
    jobCmd = ('jar %s NNstructureGenerator -maxDepth 5 -minWidth 2 -maxWidth 5 '
              '-numOfFiles 100 -avgFileSize 3 -outDir %s' % (HADOOP_TEST_JAR, cls._lgStructureDir))
    exit_code, stdout = Hadoop.run(jobCmd)
    ruAssert("HDFS", exit_code == 0, "[BGJobSetup] StructureGenerator failed")
    # data generator
    jobCmd = 'jar %s NNdataGenerator -inDir %s -root %s' % (
        HADOOP_TEST_JAR, cls._lgStructureDir, cls._lgTestDataDir)
    exit_code, stdout = Hadoop.run(jobCmd)
    ruAssert("HDFS", exit_code == 0, "[BGJobSetup] DataGenerator failed")
    if runSmokeTestSetup:
        logger.info("**** Running HDFS Smoke Test Setup ****")
        cls.smoke_test_setup()
def generate_test_data(cls, hdfs_test_dir, num_of_rows):
    """Generate a CSV file of 'userid,age' rows and upload it to HDFS.

    :param hdfs_test_dir: HDFS directory to create (perm 777) and copy the file into.
    :param num_of_rows: Number of data rows to write; userids start at 100000.
    """
    test_data_file = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "sqooptest.dat")
    userid = 100000
    # BUG FIX: use a context manager so the file handle is closed even if a
    # write fails (the original leaked the handle on exception).
    with open(test_data_file, 'w') as f:
        for i in xrange(num_of_rows):
            f.write("%d,%d\n" % (userid + i, random.randint(10, 80)))
    HDFS.createDirectory(hdfs_test_dir, user=cls._hdfs_user, perm='777', force=True)
    HDFS.copyFromLocal(test_data_file, hdfs_test_dir)
def doSetup(cls, hdfs_test_dir, tbl_name, num_of_rows, type):
    """Generate a pipe-delimited dataset, stage it on HDFS and create the Hive tables.

    :param hdfs_test_dir: HDFS parent dir; the data lands in <hdfs_test_dir>/<tbl_name>.
    :param tbl_name: External table name; also used for the *_hive_verify and *_wh_N tables.
    :param num_of_rows: Number of distinct userids; each is duplicated 3-8 times.
    :param type: Label used in progress reporting; "Long running" also creates
                 one table per webhcat background job.
    """
    from beaver.component.hive import Hive
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
    logger.info("Generating test table dataset with %d rows" % num_of_rows)
    test_data_file = os.path.join(Config.getEnv('ARTIFACTS_DIR'), tbl_name + ".dat")
    userid = 100000
    # BUG FIX: write through a context manager so the handle is closed even if
    # a write raises. Each userid is emitted a random 3-8 times.
    with open(test_data_file, 'w') as f:
        for i in xrange(num_of_rows):
            for j in range(random.randint(3, 8)):
                f.write("%d|%d\n" % (userid + i, random.randint(10, 80)))
    hdfs_tbl_dir = hdfs_test_dir + "/" + tbl_name
    logger.info("Copying the test dataset to HDFS directory '%s'" % hdfs_tbl_dir)
    HDFS.createDirectory(hdfs_test_dir, user=cls._hdfs_user, perm='777', force=True)
    HDFS.createDirectory(hdfs_tbl_dir, perm='777')
    HDFS.copyFromLocal(test_data_file, hdfs_tbl_dir)
    HDFS.chmod(cls._hdfs_user, '777', hdfs_tbl_dir)
    logger.info("Creating table '%s' and verification tables" % tbl_name)
    query = "drop table if exists %s;\n" % tbl_name
    query += "create external table %s (userid string, age int) row format delimited fields terminated by '|' stored as textfile location '%s';\n" % (
        tbl_name, hdfs_tbl_dir)
    query += "drop table if exists %s_hive_verify;\n" % tbl_name
    query += "create table %s_hive_verify (userid string, age int);\n" % tbl_name
    if type == "Long running":
        # One extra table per webhcat background job.
        for i in range(cls._num_of_webhcat_bgj):
            query += "drop table if exists %s_wh_%d;\n" % (tbl_name, i + 1)
            query += "create table %s_wh_%d (userid string, age int);\n" % (tbl_name, i + 1)
    hivesetupfile = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "hivesetup.sql")
    util.writeToFile(query, hivesetupfile)
    exit_code, stdout = Hive.run("-f " + hivesetupfile, logoutput=False)
    if type:
        msg = "%s job setup for Hive component" % type
        if exit_code != 0:
            UpgradePerNode.reportProgress(
                "[FAILED][Hive][Setup] %s failed due to exitcode = %d" % (msg, exit_code))
        else:
            UpgradePerNode.reportProgress(
                "[PASSED][Hive][Setup] %s finished successfully" % msg)
def smoke_test_setup(cls):
    '''
    Setup required to run Smoke test
    '''
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
    UpgradePerNode.reportProgress("### Running Pig Smoke Test Setup ####")
    # Stage the golden input file for the Pig smoke job.
    status, _ = HDFS.copyFromLocal(cls._golden_src_file, cls._hdfs_smoke_input_path)
    ruAssert("Pig", status == 0, '[SmokeSetup] Smoke Test Data Load failed')
def configureOozieSpark(cls):
    """Copy hive-site.xml (and tez-site.xml on Tez clusters) into the Oozie
    spark sharelib, then refresh the sharelib.
    """
    cls.fix_qe_14910()
    # Workaround BUG-63500 oozie spark test cases are failing with org.apache.thrift.transport.TTransportException null
    sparkShareLibPath = cls.getLatestShareLibPath() + "/spark"
    SPARK_HOME = Config.get('spark', 'SPARK2_HOME')
    source = os.path.join(SPARK_HOME, "conf", "hive-site.xml")
    target_hive_site = os.path.join(sparkShareLibPath, "hive-site.xml")
    HDFS.deleteFile(target_hive_site, cls.getOozieUser())
    HDFS.copyFromLocal(source, sparkShareLibPath, cls.getOozieUser())
    isTez = Hadoop.isTez(True, False)
    # BUG FIX: was `if Hadoop.isTez:`, which tests the bound method object
    # (always truthy) instead of the result computed above, so tez-site.xml
    # was copied unconditionally.
    if isTez:
        target_tez_site = os.path.join(sparkShareLibPath, "tez-site.xml")
        HDFS.deleteFile(target_tez_site, cls.getOozieUser())
        HDFS.copyFromLocal(
            os.path.join(Config.get('tez', 'TEZ_CONF_DIR'), "tez-site.xml"),
            sparkShareLibPath, cls.getOozieUser())
    exit_code, stdout = Oozie.share_lib_update()
    assert exit_code == 0
def downloadDataset(dataDir, dataTgz, downloadUrl, hdfsLocalCopy, textDataDir):
    """Fetch the TPCDS dataset, lay out one local dir per table, and push to HDFS."""
    HDFS.createDirectory(HCAT_TEST_DIR, user=HDFS_USER, perm='777', force=True)
    HDFS.createDirectory(HDFS_TEST_DIR, user=HDFS_USER, perm='777', force=True)
    # change timezone on test machines
    Machine.resetTimeZoneOnCluster()
    # Download the TPCDS dataset if not there
    if not os.path.isfile(dataTgz):
        assert util.downloadUrl(downloadUrl, dataTgz)
    Machine.tarExtractAll(dataTgz, dataDir)
    os.makedirs(hdfsLocalCopy)
    # One sub-directory per table, named after the data file minus its
    # 4-character extension (e.g. "foo.dat" -> "foo").
    for data_file in os.listdir(textDataDir):
        table_dir = os.path.join(hdfsLocalCopy, data_file[:-4])
        os.mkdir(table_dir)
        shutil.copy(os.path.join(textDataDir, data_file), table_dir)
    HDFS.copyFromLocal(hdfsLocalCopy, HDFS_TEST_DIR)
    HDFS.chmod(None, '777', HDFS_TEST_DIR, recursive=True)
def setupTestData(stdauth=True):
    """Stage the simple Hive test dataset on HDFS and create the external tables.

    When stdauth is true, grants on the tables are added for role public.
    """
    data_dir = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "hive-test-data")
    data_tgz = os.path.join(Config.getEnv('WORKSPACE'), "hive-simple-test-data.tgz")
    # Download only when the tarball is missing from the workspace.
    if not os.path.isfile(data_tgz):
        assert util.downloadUrl(Config.get('hive', 'HIVE_TEST_DATA'), data_tgz)
    Machine.tarExtractAll(data_tgz, data_dir)
    # load data into HDFS
    HDFS.createDirectory("/tmp/hs2data", user=HDFS_USER, perm='777', force=True)
    HDFS.createDirectory("/tmp/hs2data/student", perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(data_dir, 'studenttab10k'), "/tmp/hs2data/student")
    HDFS.createDirectory("/tmp/hs2data/voter", perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(data_dir, 'votertab10k'), "/tmp/hs2data/voter")
    query = """drop table if exists student;
create external table student (name string, age int, gpa double) row format delimited fields terminated by '\\t' stored as textfile location '/tmp/hs2data/student';
drop table if exists voter;
create external table voter (name string, age int, registration string, contributions float) row format delimited fields terminated by '\\t' stored as textfile location '/tmp/hs2data/voter';"""
    if stdauth:
        query += "\ngrant SELECT, INSERT, UPDATE, DELETE on table student to role public with grant option;"
        query += "\ngrant SELECT, INSERT, UPDATE, DELETE on table voter to role public with grant option;"
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(query, readFromFile=True, logoutput=True)
    assert exit_code == 0, "Test data creation failed"
def doBackgroundJobSetup(cls, hdfs_test_dir):
    """Stage the sleep-UDF jar on HDFS and register the 'sleep' Hive function."""
    from beaver.component.hive import Hive
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
    logger.info("Preparing the test setup for Hive background job")
    udfjar = os.path.join(
        Config.getEnv('WORKSPACE'), "tests", "hive", "hive-udf", "hive-udfs-0.1.jar")
    HDFS.createDirectory(hdfs_test_dir, user=cls._hdfs_user, perm='777', force=True)
    HDFS.copyFromLocal(udfjar, hdfs_test_dir)
    # Register the UDF from its HDFS location so long-running jobs can call sleep().
    query = "drop function sleep; create function sleep as 'org.apache.hive.udf.generic.GenericUDFSleep' using jar 'hdfs://%s/hive-udfs-0.1.jar';" % hdfs_test_dir
    exit_code, stdout = Hive.runQuery(query)
    if exit_code == 0:
        UpgradePerNode.reportProgress(
            "[PASSED][Hive][Setup] Long running finished successfully")
    else:
        UpgradePerNode.reportProgress(
            "[FAILED][Hive][Setup] Long running failed due to exitcode = %d" % exit_code)
def setupMergeScaleDataset(LOCAL_DIR):
    """Stage the TPCH base and 5G staging datasets on HDFS and build the Hive tables."""
    # change timezone on test machines
    Machine.resetTimeZoneOnCluster()

    # Download the TPCH dataset if not there
    tpch_data_dir = os.path.join(LOCAL_DIR, "data")
    tpch_data_tgz = os.path.join(LOCAL_DIR, "tpch_data.tgz")
    if not os.path.isfile(tpch_data_tgz):
        assert util.downloadUrl(Config.get('hive', 'TPCH_DNLD_URL'), tpch_data_tgz)
    Machine.tarExtractAll(tpch_data_tgz, LOCAL_DIR)

    # Load the tables in Hive
    HADOOPQA_USER = Config.get("hadoop", 'HADOOPQA_USER')
    HDFS.createDirectory("/tmp/tpch", user=HADOOPQA_USER, perm='777', force=True)
    HDFS.copyFromLocal(tpch_data_dir, "/tmp/tpch", user=HADOOPQA_USER)
    HDFS.chmod(None, 777, "/tmp/tpch", recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        os.path.join(LOCAL_DIR, "ddl", "merge-tpch-tablesetup.sql"),
        hivevar={'HDFS_LOCATION': '/tmp/tpch/data'},
        logoutput=True,
        queryIsFile=True)
    assert exit_code == 0, "Failed to populate the TPCH data in Hive"

    # Download TPCH staging data
    tpch_stage_dir = os.path.join(LOCAL_DIR, "tpch_newdata_5G")
    tpch_stage_tgz = os.path.join(LOCAL_DIR, "tpch_newdata_5G.tgz")
    if not os.path.isfile(tpch_stage_tgz):
        assert util.downloadUrl(Config.get('hive', 'TPCH_NEWDATA_5G_DNLD_URL'), tpch_stage_tgz)
    Machine.tarExtractAll(tpch_stage_tgz, LOCAL_DIR)

    # Load the staged tables in Hive: one staging dir per table, all created
    # and chmod'ed with a single space-separated path argument.
    stage_dirs = "/tmp/lineitem_stage /tmp/orders_stage /tmp/delete_stage"
    HDFS.createDirectory(stage_dirs, user=HADOOPQA_USER, perm='777', force=True)
    for pattern, target in (("lineitem*", "/tmp/lineitem_stage"),
                            ("order*", "/tmp/orders_stage"),
                            ("delete*", "/tmp/delete_stage")):
        HDFS.copyFromLocal(os.path.join(tpch_stage_dir, pattern), target, HADOOPQA_USER)
    HDFS.chmod(None, 777, stage_dirs, recursive=True)
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(
        os.path.join(LOCAL_DIR, "ddl", "merge-staged-tpch-tablesetup.sql"),
        hivevar={'HDFS_LOCATION': '/tmp'},
        logoutput=True,
        queryIsFile=True)
    assert exit_code == 0, "Failed to populate the TPCH staging data in Hive"
def setupHS2ConcurrTestData(stdauth=True):
    """Configure HS2 for transactional concurrency tests, stage the dataset and create tables.

    :param stdauth: When true, add SELECT/INSERT/UPDATE/DELETE grants for role public.
    """
    # hive.support.concurrency is not in the whitelist, as this is a server setting and not something that user should/can set in a session.
    # In a case of Ranger and SQL std authorization, set hive.support.concurrency to true and restart HS2
    changes = {
        'hive-site.xml': {
            'hive.txn.manager': 'org.apache.hadoop.hive.ql.lockmgr.DbTxnManager',
            'hive.support.concurrency': 'true',
            'hive.compactor.initiator.on': 'true',
            'hive.compactor.worker.threads': '3',
            'hive.compactor.check.interval': '10',
            'hive.timedout.txn.reaper.interval': '20s'
        },
        'hiveserver2-site.xml': {
            'hive.compactor.initiator.on': 'false',
            'hive.exec.dynamic.partition.mode': 'nonstrict'
        }
    }
    # Hive1 vs Hive2 need different server-side overrides.
    if not Hive.isHive2():
        changes['hiveserver2-site.xml']['hive.enforce.bucketing'] = 'true'
    else:
        changes['hiveserver2-site.xml']['hive.server2.enable.doAs'] = 'false'
        changes['hiveserver2-site.xml']['hive.txn.manager'] = 'org.apache.hadoop.hive.ql.lockmgr.DbTxnManager'
        changes['hiveserver2-site.xml']['hive.support.concurrency'] = 'true'
    Hive.modifyConfig(changes)
    # Give HS2 time to come back with the new configuration.
    time.sleep(60)
    data_dir = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "hs2concur-test-data")
    data_tgz = os.path.join(Config.getEnv('WORKSPACE'), "hs2concur-test-data.tgz")
    if not os.path.isfile(data_tgz):
        assert util.downloadUrl(Config.get('hive', 'HS2CONCURR_TEST_DATA'), data_tgz)
    Machine.tarExtractAll(data_tgz, data_dir)
    # load data into HDFS
    hdfs_user = Config.get("hadoop", 'HDFS_USER')
    test_user = Config.get("hadoop", 'HADOOPQA_USER')
    HDFS.createDirectory("/tmp/hs2data", user=test_user, perm='777', force=True)
    HDFS.createDirectory("/tmp/hs2data/student", user=test_user, perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(data_dir, 'studenttab10k'), "/tmp/hs2data/student")
    HDFS.createDirectory("/tmp/hs2data/voter", perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(data_dir, 'votertab10k'), "/tmp/hs2data/voter")
    HDFS.createDirectory("/tmp/hs2data/customer_address", perm='777', force=True)
    HDFS.copyFromLocal(os.path.join(data_dir, 'customer_address10k'), "/tmp/hs2data/customer_address")
    # External text tables over the staged files, plus a transactional
    # partitioned/bucketed ORC copy of customer_address.
    query = """drop table if exists student;
create external table student (name string, age int, gpa double) row format delimited fields terminated by '\\t' stored as textfile location '/tmp/hs2data/student';
drop table if exists voter;
create external table voter (name string, age int, registration string, contributions float) row format delimited fields terminated by '\\t' stored as textfile location '/tmp/hs2data/voter';
drop table if exists customer_address;
create external table customer_address (ca_address_sk int, ca_address_id string, ca_street_number string, ca_street_name string, ca_street_type string, ca_suite_number string, ca_city string, ca_county string, ca_state string, ca_zip string, ca_country string, ca_gmt_offset decimal(5,2), ca_location_type string) row format delimited fields terminated by '|' stored as textfile location '/tmp/hs2data/customer_address';
drop table if exists customer_address_partitioned;
create table customer_address_partitioned (ca_address_sk int, ca_address_id string, ca_street_number string, ca_street_name string, ca_street_type string, ca_suite_number string, ca_city string, ca_county string, ca_state string, ca_zip string, ca_country string, ca_gmt_offset decimal(5,2)) partitioned by (ca_location_type string) clustered by (ca_state) into 50 buckets stored as orc tblproperties('transactional'='true');
insert into table customer_address_partitioned partition(ca_location_type) select ca_address_sk, ca_address_id, ca_street_number, ca_street_name, ca_street_type, ca_suite_number, ca_city, ca_county, ca_state, ca_zip, ca_country, ca_gmt_offset, ca_location_type from customer_address;"""
    if stdauth:
        query += "\ngrant SELECT, INSERT, UPDATE, DELETE on table student to role public with grant option;"
        query += "\ngrant SELECT, INSERT, UPDATE, DELETE on table voter to role public with grant option;"
        query += "\ngrant SELECT, INSERT, UPDATE, DELETE on table customer_address_partitioned to role public with grant option;"
    exit_code, stdout, stderr = Hive.runQueryOnBeeline(query, readFromFile=True, logoutput=True)
    assert exit_code == 0, "Test data creation failed"
def setupTableauDataset():
    """Download the Tableau dataset and create one Hive table per data file."""
    LOCAL_DATA_DIR = os.path.join(Config.getEnv('ARTIFACTS_DIR'), "tableau")
    DATA_DIR = os.path.join(LOCAL_DATA_DIR, 'data')
    SCHEMA_SQL_DIR = os.path.join(LOCAL_DATA_DIR, 'schema_3.0')
    HIVE_TABLES = [
        'Batters', 'Calcs', 'DateBins', 'DateTime', 'Election', 'FischerIris', 'Loan', 'NumericBins', 'REI',
        'SeattleCrime', 'Securities', 'SpecialData', 'Staples', 'Starbucks', 'UTStarcom', 'xy'
    ]
    TABLEAU_TEST_DIR = "/user/hrt_qa/tableau"
    DATABASE_NAME = 'tableau'
    logger.info("Setup Tableau dataset")
    # Fetch and unpack the dataset only if it is not already extracted.
    if not os.path.exists(LOCAL_DATA_DIR):
        tarball = LOCAL_DATA_DIR + ".tgz"
        assert util.downloadUrl(Config.get('hive', 'TABLEAU_DATASET'), tarball)
        Machine.tarExtractAll(tarball, Config.getEnv('ARTIFACTS_DIR'))
        assert os.path.isdir(LOCAL_DATA_DIR)
    logger.info("create test directory on hdfs to store tableau data files")
    HDFS.createDirectory(TABLEAU_TEST_DIR, user=HDFS_USER, perm='777', force=True)
    logger.info("create tableau database before creating tables")
    Hive.runQueryOnBeeline("DROP DATABASE IF EXISTS %s" % DATABASE_NAME)
    Hive.runQueryOnBeeline("CREATE DATABASE IF NOT EXISTS %s" % DATABASE_NAME)
    for tbl in HIVE_TABLES:
        table_hdfs_dir = TABLEAU_TEST_DIR + '/%s' % tbl
        table_hdfs_file = table_hdfs_dir + '/%s' % tbl
        local_data_file = os.path.join(DATA_DIR, '%s.tbl' % tbl)
        table_sql_file = os.path.join(SCHEMA_SQL_DIR, '%s.sql' % tbl)
        logger.info("create directory for %s table" % tbl)
        rc, _ = HDFS.createDirectory(table_hdfs_dir, perm='777', force=True)
        assert rc == 0, 'Could not create dir for table %s on hdfs.' % tbl
        logger.info("copy file for table %s to hdfs" % tbl)
        rc, _ = HDFS.copyFromLocal(local_data_file, table_hdfs_file)
        assert rc == 0, 'Could not copy file for table %s to hdfs.' % tbl
        logger.info("create %s table " % tbl)
        # thing-to-do Modify Hive.runQueryonBeeline to accept query file name
        rc, _, _ = Hive.runQueryOnBeeline(
            ReadFromFile(table_sql_file),
            readFromFile=True,
            hivevar={'HDFS_LOCATION': table_hdfs_dir},
            logoutput=True)
        assert rc == 0, '%s table creation failed' % tbl
def background_job_setup(cls, runSmokeTestSetup=True, config=None):
    '''
    Setup for background long running job
    Upload Data to HDFS before Upgrade starts
    Creates /user/hrt_qa/ru-pig dir on HDFS
    Creates and Upload large data file to /user/hrt_qa/ru-pig/input/
    :param runSmokeTestSetup: Runs smoke test setup if set to true
    '''
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
    UpgradePerNode.reportProgress("### Running Pig BackGround Job Setup ####")
    HDFS.deleteDirectory(cls._base_hdfs_dir)
    exit_code, stdout = HDFS.createDirectory(
        cls._base_hdfs_dir, user=cls._job_user, perm=777, force=True)
    ruAssert("Pig", exit_code == 0, '[BGJobSetup] could not create dir on hdfs.')
    HDFS.createDirectory(cls._hdfs_input_dir, force=True)
    src_file = os.path.join(cls._artifacts_dir, 'pig-ru-input.txt')
    tmp_file = os.path.join(cls._artifacts_dir, 'pig-ru-tmp-input.txt')
    # Remove any stale files from a previous run.
    for stale in (src_file, tmp_file):
        if os.path.exists(stale):
            os.remove(stale)
    util.copyFileToAnotherFile(cls._golden_src_file, src_file)
    util.copyFileToAnotherFile(src_file, tmp_file)
    # Grow the input by repeatedly copying src<->tmp — presumably the helper
    # appends, doubling the data each round (TODO confirm helper semantics).
    # Flubber clusters get a few extra rounds.
    iterations = 16 if Machine.isFlubber() else 12
    for _ in range(iterations):
        util.copyFileToAnotherFile(src_file, tmp_file)
        util.copyFileToAnotherFile(tmp_file, src_file)
    exit_code, stdout = HDFS.copyFromLocal(src_file, cls._hdfs_input_path)
    ruAssert("Pig", exit_code == 0, '[BGJobSetup] Data Load failed')
    if runSmokeTestSetup:
        cls.smoke_test_setup()
def background_job_setup(cls, runSmokeTestSetup=True, config=None):
    '''
    Upload Data to HDFS before Upgrade starts
    Creates /user/hrt_qa/falcon/ dir on HDFS
    Upload demo files to /user/hrt_qa/falcon
    '''
    logger.info("Falcon - starting background job setup")
    from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
    UpgradePerNode.reportProgress(
        "[INFO][Falcon][BGJobSetup] starting Falcon background job setup")
    from beaver.component.hadoop import HDFS
    if not cls._base_falcon_dir:
        cls._base_falcon_dir = '/user/%s/falcon' % cls._job_user
    exit_code, stdout = HDFS.createDirectory(
        cls._base_falcon_dir, user=cls._job_user, perm=777, force=True)
    ruAssert("Falcon", exit_code == 0, '[BGJobSetup] could not create dir on hdfs.')
    HDFS.copyFromLocal(
        os.path.join(cls._local_workspace, "tests", "rolling_upgrade", "falcon", "demo"),
        cls._base_falcon_dir,
        user=cls._job_user)
    ## Create dirs for falcon clusters
    # staging dirs are world-writable (777); working dirs are 755.
    cluster_dirs = (
        ("/apps/falcon/primaryCluster/staging", 777, '[BGJobSetup] could not create staging dir on hdfs '),
        ("/apps/falcon/primaryCluster/working", 755, '[BGJobSetup] could not create dir on hdfs.'),
        ("/apps/falcon/backupCluster/staging", 777, '[BGJobSetup] could not create dir on hdfs.'),
        ("/apps/falcon/backupCluster/working", 755, '[BGJobSetup] could not create dir on hdfs.'),
    )
    for path, perm, err_msg in cluster_dirs:
        exit_code, stdout = HDFS.createDirectory(
            path, user=cls._falcon_user, perm=perm, force=True)
        ruAssert("Falcon", exit_code == 0, err_msg)
    ## Create cluster entities.
    cls.createClusterEntities("USWestOregon", "oregonHadoopCluster", "primaryCluster")
    cls.createClusterEntities("USEastVirginia", "virginiaHadoopCluster", "backupCluster")
    if runSmokeTestSetup:
        logger.info("**** Running Falcon Smoke Test Setup ****")
        cls.smoke_test_setup()
    logger.info("Falcon - completed background job setup")
    return
def formatNN_SetupHDFS(duReservedValue, mod_conf_path):
    """
    Format NN. Setup HDFS dir for MR jobs.
    Note that this permission is too wide for default HDP use.
    """
    datanodes = HDFS.getDatanodes()
    logger.info("datanodes = %s" % datanodes)
    HDFS.stopDatanodes()
    HDFS.stopNamenode()
    HDFS.formatNN(force=True, logoutput=True)
    # Wipe each datanode's current/ dir so it re-registers with the fresh NN.
    for dn in datanodes:
        Machine.rm(user=Machine.getAdminUser(),
                   host=dn,
                   filepath="%s/current" % HDFS.getConfigValue("dfs.datanode.data.dir"),
                   isdir=True)
    balancerModifyConfig(duReservedValue)
    HDFS.startNamenode(mod_conf_path)
    HDFS.startDatanodes(mod_conf_path)
    sleep_secs = 45
    logger.info("sleep for %s sec" % sleep_secs)
    time.sleep(sleep_secs)
    version = Hadoop.getShortVersion()
    # MR framework tarball location under /hdp/apps/<version>/mapreduce.
    for path in ["/hdp", "/hdp/apps", "/hdp/apps/%s" % version, "/hdp/apps/%s/mapreduce" % version]:
        HDFS.mkdir(path=path, user=HDFS_USER)
    HDFS.chmod(runasUser=HDFS_USER, perm="777", directory="/hdp", recursive=True)
    HDFS.copyFromLocal(
        localpath="/usr/hdp/current/hadoop-client/mapreduce.tar.gz",
        hdfspath="/hdp/apps/%s/mapreduce/" % version)
    sleep_secs = 45
    logger.info("sleep for %s sec for MR tarball replication" % sleep_secs)
    time.sleep(sleep_secs)
    # App-log and job-history dirs used by YARN/MR.
    for path in ["/app-logs", "/app-logs/hrt_qa", "/app-logs/hrt_qa/logs", "/mr-history"]:
        HDFS.mkdir(path=path, user=HDFS_USER)
    HDFS.chmod(runasUser=HDFS_USER, perm="777", directory="/app-logs", recursive=True)
    HDFS.chmod(runasUser=HDFS_USER, perm="777", directory="/mr-history", recursive=True)
    HDFS.mkdir(path="/user", user=HDFS_USER)
    HDFS.mkdir(path="/user/hrt_qa", user=HDFS_USER)
    HDFS.chown(runasUser=HDFS_USER, new_owner="hrt_qa:hrt_qa", directory="/user/hrt_qa", recursive=False)
    # NOTE(review): "******" looks like a redacted runas user — confirm the
    # intended value; preserved as-is here.
    HDFS.chmod(runasUser="******", perm="770", directory="/user/hrt_qa", recursive=True)
def insertCSVDataViaCSVBuildLoad(cls,
                                 csvFile,
                                 tableName,
                                 putIntoHDFS=True,
                                 deleteAfterExec=True,
                                 runInBackground=False,
                                 user=None,
                                 config=None,
                                 optionAndParameter="",
                                 env=None,
                                 delimiter=",",
                                 arrayDelimiter=None,
                                 schema=None):
    """
    Bulk-load a CSV file into a Phoenix table via the CsvBulkLoadTool MR job.

    By default, the files will be allocated under /tmp/ folder in HDFS.

    :param csvFile: Local path of the CSV file to load.
    :param tableName: Target Phoenix table.
    :param putIntoHDFS: Copy the file to HDFS /tmp/ first (the tool reads from HDFS).
    :param deleteAfterExec: Remove the HDFS copy afterwards (skipped when running in background).
    :param runInBackground: Fire-and-forget; exit_code/stdout are then 0/''.
    :param delimiter/arrayDelimiter: Field and array delimiters passed to the tool.
    :param schema: Optional Phoenix schema name.
    :return: (exit_code, stdout) of the launched command.
    """
    global ZK_ZPARENT
    # On Slider deployments the znode parent must be read from hbase-site.xml.
    if Slider.isSlider():
        ZK_ZPARENT = util.getPropertyValueFromConfigXMLFile(
            os.path.join(Config.get('hbase', 'HBASE_CONF_DIR'), 'hbase-site.xml'),
            "zookeeper.znode.parent")
    # Locate the Phoenix client jar; the search pattern differs per platform.
    if Machine.isLinux():
        clientjar = Machine.find(user=Machine.getAdminUser(),
                                 host="localhost",
                                 filepath=PHOENIX_HOME,
                                 searchstr="phoenix-*[0-9]-client.jar",
                                 passwd=Machine.getAdminPasswd())
    else:
        clientjar = Machine.find(user=Machine.getAdminUser(),
                                 host="localhost",
                                 filepath=PHOENIX_HOME,
                                 searchstr="phoenix-*-client.jar",
                                 passwd=Machine.getAdminPasswd())
    if Machine.isWindows():
        # Normalize the UNC-style result to a drive path; file names use backslashes.
        clientjar = (clientjar[0].strip("\\localhost")).replace("$", ":")
        fileName = csvFile.split('\\')[-1]
    else:
        clientjar = clientjar[0]
        fileName = csvFile.split('/')[-1]
    # If we need to, we insert it into HDFS, since the library will take it from there.
    executingUser = (HADOOPQA_USER) if user is None else user
    if putIntoHDFS:
        if not HDFS.fileExists('/tmp/'):
            HDFS.mkdir('/tmp/')
        HDFS.copyFromLocal(csvFile, '/tmp/', executingUser, config, optionAndParameter)
    hbaseConfDir = HBASE_CONF_DIR
    if Slider.isSlider():
        hbaseConfDir = Config.get('hbase', 'HBASE_CONF_DIR')
    classpath = hbaseConfDir
    # Base command: hadoop jar <phoenix-client> CsvBulkLoadTool --table T --input /tmp/<file>
    finalCommand = "%s jar %s org.apache.phoenix.mapreduce.CsvBulkLoadTool --table %s --input %s" \
                   % (HADOOP_CMD, clientjar, tableName, '/tmp/%s' % fileName)
    if schema is not None:
        finalCommand = '%s -schema %s' % (finalCommand, schema)
    if Machine.isWindows():
        os.environ['HADOOP_USER_CLASSPATH_FIRST'] = 'true'
        os.environ['HADOOP_CLASSPATH'] = classpath
        # Windows shell needs the backtick-escaped quoting around delimiters.
        if delimiter != "," or arrayDelimiter != None:
            finalCommand = "%s -d `\\\"`%s`\\\" -a `\\\"`%s`\\\"" \
                           % (finalCommand, delimiter, arrayDelimiter.strip("'"))
        finalCommand = "%s --zookeeper %s" % (finalCommand, ZK_HOST)
        if runInBackground:
            exit_code = 0
            stdout = ''
            Machine.runinbackground(
                finalCommand,
                env=dict(env.items() + ENVIRONMENT.items() if env is not None else ENVIRONMENT))
        else:
            exit_code, stdout = Machine.run(
                finalCommand,
                env=dict(env.items() + ENVIRONMENT.items() if env is not None else ENVIRONMENT))
    else:
        # delimiter options
        if delimiter != "," or arrayDelimiter != None:
            finalCommand = "%s --delimiter %s --array-delimiter %s" % (
                finalCommand, delimiter, arrayDelimiter)
        # ZKHosts options
        finalCommand = "%s --zookeeper %s" % (finalCommand, cls.getZKConnectString())
        if runInBackground:
            exit_code = 0
            stdout = ''
            Machine.runinbackground(
                "HADOOP_CLASSPATH=%s %s" % (classpath, finalCommand),
                env=dict(env.items() + ENVIRONMENT.items() if env is not None else ENVIRONMENT))
        else:
            exit_code, stdout = Machine.run(
                "HADOOP_CLASSPATH=%s %s" % (classpath, finalCommand),
                env=dict(env.items() + ENVIRONMENT.items() if env is not None else ENVIRONMENT))
    # If selected, after insertion into HBase we will delete the csvFile from HDFS
    # Does not work for "run in background" option
    if deleteAfterExec and not runInBackground:
        HDFS.deleteFile('/tmp/%s' % fileName, executingUser)
    # return 0,""
    return exit_code, stdout