def load_table(self):
    print("Loading dimension tables...")
    tbls = local_config()['tpcds_tbls']
    thread_list = []
    for tbl in tbls.keys():
        # partitioned fact tables are loaded separately
        if tbl in local_config()['partition_tbls']:
            continue
        print("Loading table %s..." % (tbl))
        tbl_name = tbl
        et_tbl_name = tbls[tbl][0]
        create_tbl = """
            use %s;
            drop table if exists %s;
            create table %s like %s stored as %s;
            set COMPRESSION_CODEC=%s;
            insert overwrite table %s select * from %s;
        """ % (self.db_name, tbl_name, tbl_name, et_tbl_name,
               self.tbl_format, self.tbl_compress_codec, tbl_name, et_tbl_name)
        cmd = "impala-shell -i %s:21000 <<EOF %s \nEOF" % (
            local_config()['impalad_nodes'][0], create_tbl)
        t = threading.Thread(target=os.system, args=[cmd])
        thread_list.append(t)
        t.start()
    for t in thread_list:
        if t.is_alive():
            t.join()
    print("Load completed")
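# --- illustrative sketch, not part of the repo ------------------------------
# Every function in this tool reads a plain dict from local_config(). Based on
# the call sites above and below, the dict is assumed to look roughly like the
# sample here; all values are hypothetical placeholders, not a real deployment.
def _sample_local_config():
    return {
        'tpcds_tbls': {                      # table -> [external staging table]
            'store_sales': ['et_store_sales'],
            'date_dim': ['et_date_dim'],
        },
        'partition_tbls': ['store_sales'],   # fact tables loaded per partition
        'impalad_nodes': ['impalad01', 'impalad02'],
        'db_name': 'tpcds_150',
        'db_format': ['parquet', 'snappy'],
        'tbl_format_dic': {'parquet': 'parquet'},
        'tbl_compress_codec_dic': {'snappy': 'snappy'},
    }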
def gen_partition_tbl(tbl, start_thread_id, total_threads_num):
    tpcds_tool_root = local_config()['tpcds_tool_root']
    thread_count = local_config()['dsdgen_threads_per_node']
    thread_list = []
    for thread_i in range(start_thread_id, start_thread_id + thread_count):
        # dsdgen writes its chunk to stdout, which is piped straight into HDFS
        cmd = ("%s -TABLE %s -SCALE %s -CHILD %s -PARALLEL %s "
               "-DISTRIBUTIONS %s -TERMINATE N -FILTER Y -QUIET Y "
               "| hdfs dfs -put - %s.dat") % (
            os.path.join(tpcds_tool_root, 'tools', 'dsdgen'),
            tbl,
            local_config()['tpcds_scale_factor'],
            thread_i,
            total_threads_num,
            os.path.join(tpcds_tool_root, 'tools', 'tpcds.idx'),
            os.path.join(local_config()['flatfile_path_HDFS'], tbl,
                         '_'.join([tbl, str(thread_i), str(total_threads_num)])))
        t = threading.Thread(target=os.system, args=[cmd])
        thread_list.append(t)
        t.start()
    for t in thread_list:
        if t.is_alive():
            t.join()
    print('Generated')
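# For orientation: with tbl='store_sales', thread_i=3 and total_threads_num=24,
# the command assembled above renders to something like the line below (the
# paths and scale factor are hypothetical examples, not repo values):
#
#   /opt/tpcds/tools/dsdgen -TABLE store_sales -SCALE 150 -CHILD 3 -PARALLEL 24 \
#       -DISTRIBUTIONS /opt/tpcds/tools/tpcds.idx -TERMINATE N -FILTER Y -QUIET Y \
#       | hdfs dfs -put - /tpcds/flat/store_sales/store_sales_3_24.dat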
def create_partition_tbl(self):
    for p_tbl in local_config()['partition_tbls']:
        # create the partitioned target table from the DDL in self.partition_tbls
        cmd = "impala-shell -i %s:21000 -d %s <<EOF %s \nEOF" % (
            local_config()['impalad_nodes'][0], self.db_name,
            self.partition_tbls[p_tbl])
        os.system(cmd)
def push():
    # copy the list so that extend() does not mutate the shared config value
    hosts = list(local_config()['master'])
    hosts.extend(local_config()['impalad_nodes'])
    cur_path = os.path.dirname(os.path.realpath(__file__))
    for host in hosts:
        # create the remote working directory, then copy this tool over
        os.system("ssh %s mkdir -p %s" % (host, local_config()['work_dir']))
        os.system('scp -r %s/* %s:%s' % (cur_path, host, local_config()['work_dir']))
    return hosts
def __init__(self):
    self.db_format = local_config()['db_format']
    self.tbl_format = local_config()['tbl_format_dic'][self.db_format[0]]
    self.tbl_compress_codec = local_config()['tbl_compress_codec_dic'][self.db_format[1]]
    self.db_name = local_config()['db_name']
    # the distinct ss_sold_date_sk values drive the per-partition load batches
    with open(os.path.join(local_config()['tpcds_tool_root'],
                           'distinct-ss-sold-date.txt'), 'r') as f:
        self.ss_sold_dates = sorted([d.strip() for d in f.readlines()])
    self.init_tbl()
def StoreStockPriceInfo(market, Stock_List, queue):
    with open(os.path.join(local_config()['install_path'], 'output.csv'),
              'w', encoding='utf-8') as fp_w:
        consumer_num = local_config()['consumer_num']
        lock = Lock()
        consumer_proc_list = []
        for i in range(consumer_num):
            each_job = StoreStockPriceInfo_job(market)
            consumer_proc = Thread(target=each_job.StorePriceInfo,
                                   args=[queue, lock, fp_w])
            consumer_proc.start()
            consumer_proc_list.append(consumer_proc)
        for proc in consumer_proc_list:
            if proc.is_alive():
                proc.join()
def main():
    # program log path
    datetime_now = str(datetime.datetime.now().strftime("DATE%Y-%m-%d_TIME%H-%M"))
    start_timestamp = time.mktime(time.strptime(datetime_now, "DATE%Y-%m-%d_TIME%H-%M"))
    log_dir = './queries/logs/%s' % (datetime_now)
    if not os.path.isdir(log_dir):
        os.system('mkdir -p %s' % (log_dir))
    start = time.time()
    with open("%s/renderman.log" % (log_dir), 'w') as logging:
        for user_number in local_config()['user_number']:
            start_timestamp = time.mktime(time.strptime(
                datetime.datetime.now().strftime("DATE%Y-%m-%d_TIME%H-%M"),
                "DATE%Y-%m-%d_TIME%H-%M"))
            # init & start the simulated users
            base_log_dir = os.path.join(log_dir, ''.join(['user_number=', str(user_number)]))
            user_list = []
            clear_cache()
            for i in range(user_number):
                u = Process(target=distinct_user.user, args=[i, base_log_dir, logging])
                user_list.append(u)
            for user in user_list:
                user.start()
            # busy-wait until every user process has finished
            while True:
                running = 0
                for user in user_list:
                    if user.is_alive():
                        running = running + 1
                if running == 0:
                    break
            # collect logs
            ## remote logs
            print("Preparing remote logs")
            for desired_log in local_config()['desired_logs']['remote']:
                cmd_list = map(lambda x: 'ssh %s "python /home/wangdewei/extractLog.py %s %s"'
                               % (x, str(start_timestamp), desired_log),
                               local_config()['impalad_nodes'])
                res = [os.system(cmd) for cmd in cmd_list]
                # scp the extracted logs back
                scp_list = map(lambda x: 'scp %s:/home/wangdewei/extractLog/* %s'
                               % (x, base_log_dir), local_config()['impalad_nodes'])
                res = [os.system(scp) for scp in scp_list]
            ## local logs
            print("Preparing local logs")
            for desired_log in local_config()['desired_logs']['local']:
                cmd = 'python extractLog.py %s %s %s' % (desired_log, str(start_timestamp), base_log_dir)
                print(cmd)
                # os.system(cmd)
    collect_result(log_dir)
    end = time.time()
    duration = end - start
    print('run in %f seconds' % (duration))
def run_testing(self):
    conn = connect(host='172.168.0.24', port=21050, database='tpcds_text_150')
    cursor = conn.cursor()
    count = 0
    query_time = []
    with open(os.path.join(self.profile_path, 'RESULT'), 'w') as fp_w:
        for query in self.query_list:
            if query not in self.des_dic.keys():
                self.des_dic[query] = {}
            for cpufreq in self.cpufreq_range:
                if cpufreq not in self.des_dic[query].keys():
                    self.CPUFreqSet_OBJ.set(cpufreq)
                    self.des_dic[query][cpufreq] = []
                start_time = time.time()
                with open(os.path.join(local_config()['query_dir'], query), 'r') as fp:
                    # remove the "profile;" prefix and trailing ";" (note:
                    # str.strip treats its argument as a character set)
                    sql = fp.read().strip('profile;\n')
                    sql = sql.strip('; ')
                # re-run the same query until the configured duration is reached
                while True:
                    query_start_time = time.time()
                    cursor.execute('%s' % (sql))
                    end_time = time.time()
                    query_time.append(end_time - query_start_time)
                    # drain the result set
                    while True:
                        row = cursor.fetchone()
                        if not row:
                            break
                    cur_profile = cursor.get_profile()
                    count = count + 1
                    with open(os.path.join(self.profile_path, str(count) + '.log'), 'w') as fp_profile:
                        fp_profile.write("%s" % (cur_profile))
                    if end_time - start_time > local_config()['duration_time']:
                        break
                self.des_dic[query][cpufreq].sort()
        print("%s,%s,%s" % (self.users_InTotal, self.user_name, count))
        fp_w.write("%s,%s,%s,%s" % (self.users_InTotal, self.user_name, count, query_time))
    cursor.close()
    conn.close()
    return self.des_dic
def get_tuple_task_dic():
    task_dic = {}
    if os.path.isdir(local_config()['impaladlog_path']):
        for root, dirs, files in os.walk(local_config()['impaladlog_path']):
            for file in files:
                if file.endswith('.hehe024.impaladlog'):
                    query_name = file.split('.')[0]
                    if query_name not in task_dic.keys():
                        task_dic[query_name] = [os.path.join(root, file),
                                                'tuple_descriptor_' + query_name + '.xml']
    elif os.path.isfile(local_config()['impaladlog_path']):
        file = os.path.split(local_config()['impaladlog_path'])[-1]
        query_name = file.split('.')[0]
        if query_name not in task_dic.keys():
            task_dic[query_name] = [local_config()['impaladlog_path'],
                                    'tuple_descriptor_' + query_name + '.xml']
    return task_dic
def get_task_dic():
    task_dic = {}
    if os.path.isdir(local_config()['profile_path']):
        for root, dirs, files in os.walk(local_config()['profile_path']):
            for file in files:
                if file.endswith('.sql.log'):
                    query_name = file.split('.')[0]
                    if query_name not in task_dic.keys():
                        task_dic[query_name] = [os.path.join(root, file),
                                                query_name + '.sql.xml']
    elif os.path.isfile(local_config()['profile_path']):
        file = os.path.split(local_config()['profile_path'])[-1]
        query_name = file.split('.')[0]
        if query_name not in task_dic.keys():
            task_dic[query_name] = [local_config()['profile_path'],
                                    query_name + '.sql.xml']
    return task_dic
def hdfs_mdir():
    cmd = "hdfs dfs -mkdir -p %s"
    root_dir = local_config()['flatfile_path_HDFS']
    overwrite = local_config()['overwrite']
    with os.popen('hdfs dfs -test -d %s;echo $?' % (root_dir)) as fp:
        tmp = fp.read().strip('\n')
    if tmp == '0':
        if not overwrite:
            # the directory already exists, so there is no need to generate data
            print('%s already exists. There is no need to generate data.' % (root_dir))
            return False
        else:
            os.system('hdfs dfs -rm -r -f %s' % (root_dir))
    dirs = list(map(lambda x: os.path.join(root_dir, x),
                    local_config()['tpcds_tbls'].keys()))
    for hdfs_dir in dirs:
        print("Making directory: %s" % (hdfs_dir))
        os.system(cmd % (hdfs_dir))
    return True
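# A minimal driver sketch showing how the generation helpers are assumed to be
# wired together (the repo's real entry point may differ): hdfs_mdir() returns
# True only when the HDFS target directories are fresh, push() distributes this
# tool and returns the host list, and gen_facts() fans dsdgen out across it.
def _generate_all():
    if hdfs_mdir():
        hosts = push()
        gen_dims()
        gen_facts(hosts)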
def StorePriceInfo(self, queue, lock, output_media):
    output_type = local_config()['output_type']
    while True:
        lock.acquire()
        if output_type == 'csv':
            # drain everything currently queued into the output file
            while not queue.empty():
                temp = queue.get() + str(queue.qsize()) + '\n'
                output_media.write(temp)
        time.sleep(2)
        lock.release()
def __init__(self, log_file, xml_filename, logging):
    self.logging = logging
    self.xml_filename = xml_filename
    with open(log_file, 'r') as fp:
        self.logfile = fp.read()
    # split the profile text into sections by locating its fixed markers
    self.part_impalad = (0, self.logfile.find('+----------+'))
    self.part_queryResult = (self.part_impalad[1],
                             self.logfile.find('Query Runtime Profile', self.part_impalad[1] + 1))
    self.part_queryRuntime = (self.part_queryResult[1],
                              self.logfile.find('----------------', self.part_queryResult[1] + 1))
    self.part_planFragment = (self.part_queryRuntime[1],
                              self.logfile.find('----------------', self.part_queryRuntime[1] + 1))
    self.part_eachnodecosts = (self.part_planFragment[1],
                               self.logfile.find(' Coordinator Fragment', self.part_planFragment[1] + 1))
    self.part_plannode = (self.part_eachnodecosts[1],
                          self.logfile.find('\n Fragment F', self.part_eachnodecosts[1] + 1))
    self.part_instances = (self.part_plannode[1], len(self.logfile))
    # 1. plan.xml
    self.obj_xml = xml_writers(
        xml_file=os.path.join(local_config()['install_dir'], 'output_dir', self.xml_filename),
        query_name=xml_filename[0:-4])
    self.dom = self.obj_xml.create_dom()
    self.obj_node = xml_node(self.dom)
    # 2. hbase.dist
    self.hbase_dist_file = os.path.join(local_config()['install_dir'], 'output_dir',
                                        self.xml_filename[0:-3] + 'hbase.dist')
def gen_dims():
    root_dir = local_config()['flatfile_path_HDFS']
    proc_list = []
    print('Generating table flat files!')
    for tbl in local_config()['tpcds_tbls'].keys():
        if tbl in local_config()['partition_tbls']:
            continue
        print('Generating %s......' % (tbl))
        cmd = ("%s -TABLE %s -SCALE %s -DISTRIBUTIONS %s "
               "-TERMINATE N -FILTER Y -QUIET Y | hdfs dfs -put - %s.dat") % (
            os.path.join(local_config()['tpcds_tool_root'], 'tools', 'dsdgen'),
            tbl,
            local_config()['tpcds_scale_factor'],
            os.path.join(local_config()['tpcds_tool_root'], 'tools', 'tpcds.idx'),
            os.path.join(root_dir, tbl, tbl))
        proc = Process(target=os.system, args=[cmd])
        proc_list.append(proc)
        proc.start()
    for proc in proc_list:
        if proc.is_alive():
            proc.join()
    print('Generated')
def gen_facts(hosts):
    tbls = local_config()['partition_tbls']
    print('Generating flat files for partitioned tables!')
    for tbl in tbls:
        print('Generating %s......' % (tbl))
        proc_list = []
        thread_count = local_config()['dsdgen_threads_per_node']
        total_threads_num = len(hosts) * thread_count
        for i, host in enumerate(hosts):
            # each host generates its own contiguous block of dsdgen chunks
            start_thread_id = i * thread_count + 1
            cmd = "ssh %s 'cd %s;python gen_partition_table.py %s %s %s'" % (
                host, local_config()['work_dir'], tbl,
                str(start_thread_id), str(total_threads_num))
            proc = Process(target=os.system, args=[cmd])
            proc_list.append(proc)
            proc.start()
        for proc in proc_list:
            if proc.is_alive():
                proc.join()
def user(user_name, baselog_dir, logging):
    log_dir = os.path.join(baselog_dir, str(user_name))
    if not os.path.isdir(log_dir):
        os.system("mkdir -p %s" % (log_dir))
    for test_case in local_config()['test_case_list']:
        if test_case not in testing.TestingClassDict.keys():
            print(testing.TestingClassDict)
            print('%s has not been implemented...' % (test_case))
        else:
            test_obj = testing.TestingClassDict[test_case](logging, log_dir, user_name)
            test_obj.run_testing()
def __init__(self, logging, log_dir, user_name):
    self.impalad_nodes = local_config()['impalad_nodes']
    self.cpufreq_range = local_config()['CPUFreq_range']
    self.query_list = local_config()['query_list']
    self.query_dir = local_config()['query_dir']
    self.impala_server = local_config()['IMPALA_SERVER']
    self.database_name = local_config()['DATABASE_NAME']
    self.sh_path = os.path.join(local_config()['install_path'], 'queries', 'runsql.sh')
    self.profile_path = log_dir
    self.network = local_config()['net_work']
    self.des_dic = {}
    self.CPUFreqSet_OBJ = CPUFreqSet()
    self.check_sys_state_obj = check_sys_state()
    self.logging = logging
    self.user_name = user_name
    # the parent directory name encodes the total number of concurrent users
    self.users_InTotal = os.path.split(os.path.split(self.profile_path)[0])[1]
def GetStockPriceInfo(market, Stock_List, queue):
    step_size = local_config()['step_size']
    # split the stock list into request batches of step_size symbols
    partitions = [Stock_List[i:i + step_size]
                  for i in range(0, len(Stock_List), step_size)]
    proc_list = []
    lock = Lock()
    for part in partitions:
        all_url = ''.join([str_i + ',' for str_i in part])
        each_job = GetStockpriceInfo_job(market, all_url)
        proc = Thread(target=each_job.GetPriceInfo, args=[queue, lock])
        proc.start()
        proc_list.append(proc)
    for proc in proc_list:
        if proc.is_alive():
            proc.join()
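# Sketch of how the producer (GetStockPriceInfo) and consumer
# (StoreStockPriceInfo) sides are assumed to be wired through one shared queue;
# the market code and symbol list here are hypothetical.
from queue import Queue
from threading import Thread

def _fetch_and_store(market='sh', stock_list=('600000', '600036')):
    q = Queue()
    # StorePriceInfo loops forever, so run the consumer side as a daemon thread
    consumer = Thread(target=StoreStockPriceInfo,
                      args=[market, list(stock_list), q], daemon=True)
    consumer.start()
    GetStockPriceInfo(market, list(stock_list), q)  # returns once fetchers join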
def load_partition_tbl_store_sales(self):
    num_part_per_query = 200
    # batch the distinct ss_sold_date_sk values into ranges of 200 partitions
    partition_ranges = [self.ss_sold_dates[i: i + num_part_per_query]
                        for i in range(0, len(self.ss_sold_dates), num_part_per_query)]
    assert sum([len(r) for r in partition_ranges]) == len(self.ss_sold_dates)
    thread_list = []
    for partition_range in partition_ranges:
        query = """
            insert overwrite table store_sales partition(ss_sold_date_sk) [shuffle]
            select ss_sold_time_sk, ss_item_sk, ss_customer_sk, ss_cdemo_sk,
                   ss_hdemo_sk, ss_addr_sk, ss_store_sk, ss_promo_sk,
                   ss_ticket_number, ss_quantity, ss_wholesale_cost, ss_list_price,
                   ss_sales_price, ss_ext_discount_amt, ss_ext_sales_price,
                   ss_ext_wholesale_cost, ss_ext_list_price, ss_ext_tax,
                   ss_coupon_amt, ss_net_paid, ss_net_paid_inc_tax, ss_net_profit,
                   ss_sold_date_sk
            from et_store_sales
            where ss_sold_date_sk between {0} and {1};""".format(
            partition_range[0], partition_range[-1])
        cmd = "impala-shell -i %s:21000 -d %s <<EOF %s \nEOF" % (
            local_config()['impalad_nodes'][0], self.db_name, query)
        print(cmd)
        t = threading.Thread(target=os.system, args=[cmd])
        thread_list.append(t)
        t.start()
    for t in thread_list:
        if t.is_alive():
            t.join()
def __init__(self, pl_i, plnode, logging, LR):
    self.attri_dic = {}
    self.logging = logging
    self.attri_dic['lr'] = LR
    with_scan_hdfs = False
    self.attri_dic['is_plan_root'] = 'true' if pl_i == 0 else 'false'
    for line in plnode.split('\n'):
        line = line.strip()
        line = line.strip('|')
        line = line.strip('--')
        line = line.strip()
        # the first "<id>:" line names the plan node and its type
        if re.search(r'\d+:', line) and 'nid' not in self.attri_dic.keys():
            line_key, line_val = line.split(':')
            self.attri_dic['nid'] = str(int(line_key))
            node_type, node_type_attri = (line_val.split(' [') if ' [' in line_val
                                          else [line_val, None])
            self.attri_dic['display_name'] = node_type
            if node_type_attri:
                node_type_attri = node_type_attri.strip(']')
                if re.match('HASH JOIN', node_type):
                    join_type, dist_type = node_type_attri.split(', ')
                    self.attri_dic['join_type'] = join_type
                    self.ana_Distributiontype(dist_type)
                elif re.match('SCAN HDFS', node_type):
                    scan_table, dist_type = node_type_attri.split(', ')
                    scan_db, scan_tbl = scan_table.split('.')
                    if ' ' in scan_db:
                        scan_db = scan_db.split(' ')[0]
                    if ' ' in scan_tbl:
                        scan_tbl = scan_tbl.split(' ')[0]
                    self.attri_dic['database_name'] = scan_db
                    self.attri_dic['table'] = scan_tbl
                    if local_config()['partitioned_table_name'] in node_type_attri:
                        with_scan_hdfs = True
                elif re.match('SCAN HBASE', node_type):
                    scan_table = node_type_attri
                    scan_db, scan_tbl = scan_table.split('.')
                    if ' ' in scan_db:
                        scan_db = scan_db.split(' ')[0]
                    if ' ' in scan_tbl:
                        scan_tbl = scan_tbl.split(' ')[0]
                    self.attri_dic['database_name'] = scan_db
                    self.attri_dic['table'] = scan_tbl
                elif re.match('TOP-N', node_type):
                    self.attri_dic['limit'] = str(int(node_type_attri.split('=')[1]))
                else:
                    self.ana_Distributiontype(node_type_attri)
        # a MERGE-EXCHANGE node may carry a limit attribute
        m = re.match(r'limit:\s\d+', line)
        if m:
            self.attri_dic['limit'] = str(int(line.split(': ')[1]))
        if self.attri_dic['display_name'] == 'SCAN HDFS':
            m = re.match(r'partitions=\d+/\d+', line)
            if m:
                self.attri_dic['partitions'] = str(m.group(0).split('=')[1])
        # data percentage: computed only once, to obtain the data_percentage attribute
        if with_scan_hdfs:
            self.collect_data_percentage(
                lines=plnode.split('\n'),
                total_table_size_in_bytes=local_config()['total_table_size_in_bytes'])
            with_scan_hdfs = False
        if 'tuple-ids' in line and 'row-size' in line and 'cardinality' in line:
            tuple_ids, row_size, cardinality = line.split(' ')
            self.attri_dic['tuple-ids'] = tuple_ids.split('=')[1]
            self.attri_dic['row-size'] = str(int(row_size.split('=')[1][:-1]))
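# For orientation, the parser above consumes one plan-node block of an Impala
# runtime profile. An abridged, hypothetical example of its input text:
#
#   02:HASH JOIN [INNER JOIN, BROADCAST]
#   |  limit: 100
#   |  tuple-ids=0,1 row-size=48B cardinality=10000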
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from config import prod_config, local_config, checkENV

DB_URL = ''
if checkENV() == 'LOCAL':
    # local environment
    lc = local_config()
    DB_URL = lc.DB_URI
    print('database.py called in Local environment ' + str(DB_URL))
else:
    pc = prod_config()
    DB_URL = pc.DB_URI
    print('database.py called in Prod environment URL->' + str(DB_URL))

engine = create_engine(DB_URL, echo=False)
Session = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()
metadata = Base.metadata

def addUser(name, age, email, phone, dose_no, search_by, pincode,
            state_id=-1, dist_id=-1, dist_name='NA'):
    from cowin_get_email.databases.user_model import addUser
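# Typical use of the Session factory defined above (sketch; User is a
# hypothetical model declared against this module's Base):
#
#   from database import Session
#   session = Session()
#   try:
#       session.add(User(name='x', email='x@example.com'))
#       session.commit()
#   finally:
#       session.close()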
if __name__ == '__main__':
    if 'Linux' in local_config()['current_sys']:
        os.system('rm -f ./output_dir/*')
    elif 'Windows' in local_config()['current_sys']:
        os.system(r'del .\output_dir\* /q')
    # Profile analysis
    if local_config()['sw_input']:
        task_dic = get_task_dic()
        for task in task_dic.keys():
            print(task_dic[task][0])
            # 1. xml
            obj = class_log(task_dic[task][0], task_dic[task][1], logging)
            obj.getAttri()
            obj.writeToXML()
            # hbase dist
            obj.writeToHBASEdist()
def init_tbl(self):
    # DDL for the external staging tables that point at the HDFS flat files
    self.tbl_dic = {
        "et_store_sales": """
            create external table et_store_sales (
                ss_sold_date_sk int, ss_sold_time_sk int, ss_item_sk int,
                ss_customer_sk int, ss_cdemo_sk int, ss_hdemo_sk int,
                ss_addr_sk int, ss_store_sk int, ss_promo_sk int,
                ss_ticket_number int, ss_quantity int, ss_wholesale_cost double,
                ss_list_price double, ss_sales_price double,
                ss_ext_discount_amt double, ss_ext_sales_price double,
                ss_ext_wholesale_cost double, ss_ext_list_price double,
                ss_ext_tax double, ss_coupon_amt double, ss_net_paid double,
                ss_net_paid_inc_tax double, ss_net_profit double
            )
            row format delimited fields terminated by '|'
            location '%s/store_sales'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS']),
        "et_customer_demographics": """
            create external table et_customer_demographics (
                cd_demo_sk int, cd_gender string, cd_marital_status string,
                cd_education_status string, cd_purchase_estimate int,
                cd_credit_rating string, cd_dep_count int,
                cd_dep_employed_count int, cd_dep_college_count int
            )
            row format delimited fields terminated by '|'
            location '%s/customer_demographics'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS']),
        "et_date_dim": """
            create external table et_date_dim (
                d_date_sk int, d_date_id string,
                d_date string, -- YYYY-MM-DD format
                d_month_seq int, d_week_seq int, d_quarter_seq int, d_year int,
                d_dow int, d_moy int, d_dom int, d_qoy int, d_fy_year int,
                d_fy_quarter_seq int, d_fy_week_seq int, d_day_name string,
                d_quarter_name string, d_holiday string, d_weekend string,
                d_following_holiday string, d_first_dom int, d_last_dom int,
                d_same_day_ly int, d_same_day_lq int, d_current_day string,
                d_current_week string, d_current_month string,
                d_current_quarter string, d_current_year string
            )
            row format delimited fields terminated by '|'
            location '%s/date_dim'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS']),
        "et_time_dim": """
            create external table et_time_dim (
                t_time_sk int, t_time_id string, t_time int, t_hour int,
                t_minute int, t_second int, t_am_pm string, t_shift string,
                t_sub_shift string, t_meal_time string
            )
            row format delimited fields terminated by '|'
            location '%s/time_dim'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS']),
        "et_item": """
            create external table et_item (
                i_item_sk int, i_item_id string, i_rec_start_date string,
                i_rec_end_date string, i_item_desc string,
                i_current_price double, i_wholesale_cost double,
                i_brand_id int, i_brand string, i_class_id int, i_class string,
                i_category_id int, i_category string, i_manufact_id int,
                i_manufact string, i_size string, i_formulation string,
                i_color string, i_units string, i_container string,
                i_manager_id int, i_product_name string
            )
            row format delimited fields terminated by '|'
            location '%s/item'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS']),
        "et_store": """
            create external table et_store (
                s_store_sk int, s_store_id string, s_rec_start_date string,
                s_rec_end_date string, s_closed_date_sk int,
                s_store_name string, s_number_employees int, s_floor_space int,
                s_hours string, s_manager string, s_market_id int,
                s_geography_class string, s_market_desc string,
                s_market_manager string, s_division_id int,
                s_division_name string, s_company_id int, s_company_name string,
                s_street_number string, s_street_name string,
                s_street_type string, s_suite_number string, s_city string,
                s_county string, s_state string, s_zip string,
                s_country string, s_gmt_offset double, s_tax_precentage double
            )
            row format delimited fields terminated by '|'
            location '%s/store'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS']),
        "et_customer": """
            create external table et_customer (
                c_customer_sk int, c_customer_id string, c_current_cdemo_sk int,
                c_current_hdemo_sk int, c_current_addr_sk int,
                c_first_shipto_date_sk int, c_first_sales_date_sk int,
                c_salutation string, c_first_name string, c_last_name string,
                c_preferred_cust_flag string, c_birth_day int,
                c_birth_month int, c_birth_year int, c_birth_country string,
                c_login string, c_email_address string, c_last_review_date string
            )
            row format delimited fields terminated by '|'
            location '%s/customer'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS']),
        "et_promotion": """
            create external table et_promotion (
                p_promo_sk int, p_promo_id string, p_start_date_sk int,
                p_end_date_sk int, p_item_sk int, p_cost double,
                p_response_target int, p_promo_name string,
                p_channel_dmail string, p_channel_email string,
                p_channel_catalog string, p_channel_tv string,
                p_channel_radio string, p_channel_press string,
                p_channel_event string, p_channel_demo string,
                p_channel_details string, p_purpose string,
                p_discount_active string
            )
            row format delimited fields terminated by '|'
            location '%s/promotion'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS']),
        "et_household_demographics": """
            create external table et_household_demographics (
                hd_demo_sk int, hd_income_band_sk int, hd_buy_potential string,
                hd_dep_count int, hd_vehicle_count int
            )
            row format delimited fields terminated by '|'
            location '%s/household_demographics'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS']),
        "et_customer_address": """
            create external table et_customer_address (
                ca_address_sk int, ca_address_id string,
                ca_street_number string, ca_street_name string,
                ca_street_type string, ca_suite_number string, ca_city string,
                ca_county string, ca_state string, ca_zip string,
                ca_country string, ca_gmt_offset int, ca_location_type string
            )
            row format delimited fields terminated by '|'
            location '%s/customer_address'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS']),
        "et_inventory": """
            create external table et_inventory (
                inv_date_sk int, inv_item_sk int, inv_warehouse_sk int,
                inv_quantity_on_hand int
            )
            row format delimited fields terminated by '|'
            location '%s/inventory'
            tblproperties ('serialization.null.format'='');""" % (local_config()['flatfile_path_HDFS'])}
    # DDL for the partitioned fact table that the staging data is loaded into
    self.partition_tbls = {
        'store_sales': """
            drop table if exists store_sales;
            create table store_sales (
                ss_sold_time_sk int, ss_item_sk int, ss_customer_sk int,
                ss_cdemo_sk int, ss_hdemo_sk int, ss_addr_sk int,
                ss_store_sk int, ss_promo_sk int, ss_ticket_number int,
                ss_quantity int, ss_wholesale_cost double,
                ss_list_price double, ss_sales_price double,
                ss_ext_discount_amt double, ss_ext_sales_price double,
                ss_ext_wholesale_cost double, ss_ext_list_price double,
                ss_ext_tax double, ss_coupon_amt double, ss_net_paid double,
                ss_net_paid_inc_tax double, ss_net_profit double
            )
            partitioned by (ss_sold_date_sk int)
            stored as %s;
            set COMPRESSION_CODEC=%s;
        """ % (self.tbl_format, self.tbl_compress_codec)}
def __init__(self, FILE_NAME):
    self.wb = Workbook()
    self.ew = ExcelWriter(workbook=self.wb)
    self.filename = os.path.join(local_config()['install_path'],
                                 'queries/logs/%s/' % (FILE_NAME.split('.')[0]),
                                 FILE_NAME)
    self.ws_num = 0
    self.ws = []
def create_db(self):
    # create the database on the first impalad node (replaces a hardcoded host)
    cmd = "impala-shell -i %s:21000 -q 'create database if not exists %s;'" % (
        local_config()['impalad_nodes'][0], local_config()['db_name'])
    os.system(cmd)
def create_external_table(self):
    for tbl in self.tbl_dic.keys():
        cmd = "impala-shell -i %s:21000 -d %s <<EOF %s \nEOF" % (
            local_config()['impalad_nodes'][0], self.db_name, self.tbl_dic[tbl])
        # print(cmd)
        os.system(cmd)
from config import local_config
import os

dbname = local_config()['DATABASE_NAME']
db_type = local_config()['db_type']

# query id -> [first, last] ss_sold_date_sk of the partition range it touches
cache_dic = {'q43.sql': [2450816, 2451179],
             'q27.sql': [2450815, 2451179],
             'q65.sql': [2451911, 2452275],
             'q52.sql': [2451149, 2451179],
             'q19.sql': [2451484, 2451513],
             'q98.sql': [2451911, 2451941],
             'q63.sql': [2451911, 2452275],
             'q89.sql': [2451545, 2451910],
             'q79.sql': [2451180, 2451269],
             'q68.sql': [2451180, 2451269],
             'q42.sql': [2451149, 2451179],
             'q73.sql': [2451180, 2451269],
             'q53.sql': [2451911, 2452275],
             'q55.sql': [2452245, 2452275],
             'q7.sql':  [2450815, 2451179],
             'q3.sql':  [2451149, 2451179],
             'q34.sql': [2450816, 2451910]}

def to_cache(query_id):
    range_ = cache_dic[query_id]
    for range_i in range(range_[0], range_[1]):
        cmd = """ impala-shell -i tracing024:21000 -d %s -q "alter table store_sales_%s partition(ss_sold_date_sk = %d) set cached in 'testpool'" """ % (dbname, db_type, int(range_i))
        os.system(cmd)

def un_cache():
    # The body was truncated in the source; this is an assumed completion that
    # mirrors to_cache() and releases every partition range in cache_dic.
    for query_id in cache_dic:
        range_ = cache_dic[query_id]
        for range_i in range(range_[0], range_[1]):
            cmd = """ impala-shell -i tracing024:21000 -d %s -q "alter table store_sales_%s partition(ss_sold_date_sk = %d) set uncached" """ % (dbname, db_type, int(range_i))
            os.system(cmd)
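# Sketch of the assumed benchmark workflow: pin the partitions a query touches
# into the HDFS cache pool before running it, and release them afterwards.
def _run_cached(query_id):
    to_cache(query_id)    # pin the ss_sold_date_sk range for this query
    # ... run the benchmark query for query_id here ...
    un_cache()            # release the cached partitions again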
class ParslFilter(logging.Filter):
    """Filter out Parsl debug logs"""

    def filter(self, record):
        return not (record.levelno == logging.DEBUG and '/parsl/' in record.pathname)

for h in handlers:
    h.addFilter(ParslFilter())
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO, handlers=handlers)

# Write the configuration
# ML nodes: N for updating models, 1 for MolDQN, 1 for inference runs
config = local_config(os.path.join(out_dir, 'run-info'), args.num_workers)

# Save the Parsl configuration
with open(os.path.join(out_dir, 'parsl_config.txt'), 'w') as fp:
    print(str(config), file=fp)

# Connect to the redis server
client_queues, server_queues = make_queue_pairs(args.redishost, args.redisport,
                                                serialization_method="pickle",
                                                topics=['simulate', 'infer', 'train'],
                                                keep_inputs=False)

# Apply wrappers to functions to affix static settings
# Update wrapper changes the __name__ field, which is used by the Method Server
def _fix_arguments(func, **kwargs):
    my_func = partial(func, **kwargs)
import os
from class_xml.xml_writer import xml_writers, xml_node
from xml.dom.minidom import *
from log_analysis.getStructInfo import StructInfo
from log_analysis.getDetailInfo import DetailInfo
from log_analysis.getInstanceInfo import InstanceInfo
from config import local_config
from statistics_info.data_dist import server_idx_map
# the configured mapping takes precedence over the imported default
server_idx_map = local_config()['server_idx_map']
import re

class class_log:
    def __init__(self, log_file, xml_filename, logging):
        self.logging = logging
        self.xml_filename = xml_filename
        with open(log_file, 'r') as fp:
            self.logfile = fp.read()
        # split the profile text into sections by locating its fixed markers
        self.part_impalad = (0, self.logfile.find('+----------+'))
        self.part_queryResult = (self.part_impalad[1],
                                 self.logfile.find('Query Runtime Profile', self.part_impalad[1] + 1))
        self.part_queryRuntime = (self.part_queryResult[1],
                                  self.logfile.find('----------------', self.part_queryResult[1] + 1))
        self.part_planFragment = (self.part_queryRuntime[1],
                                  self.logfile.find('----------------', self.part_queryRuntime[1] + 1))
        self.part_eachnodecosts = (self.part_planFragment[1],
                                   self.logfile.find(' Coordinator Fragment', self.part_planFragment[1] + 1))
        self.part_plannode = (self.part_eachnodecosts[1],
                              self.logfile.find('\n Fragment F', self.part_eachnodecosts[1] + 1))
        self.part_instances = (self.part_plannode[1], len(self.logfile))
        # 1. plan.xml
        self.obj_xml = xml_writers(
            xml_file=os.path.join(local_config()['install_dir'], 'output_dir', self.xml_filename),
            query_name=xml_filename[0:-4])
        self.dom = self.obj_xml.create_dom()
        self.obj_node = xml_node(self.dom)