def main(): """ get the date for the past day (yesterday). """ timenow = int(time.time()) datenow = str(datetime.date.today()-datetime.timedelta(1)) datenow = datenow[0:4]+datenow[5:7]+datenow[8:10] print "###################" print "# Start processing the data back in " + datenow + " (yesterday)" print "# starting processing time is " + str(timenow) print "###################" ts = calendar.timegm(time.gmtime()) ts_last_hour = ts-3600 datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour))) hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour))) # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s." % (datestamp), if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_day % (datestamp), '000000_0.deflate')): f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_day.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, datestamp, datestamp) f.close() print " **** perform beeline for hourly summary for day = %s, hour = %s." %(datestamp, hourstamp) try: beeline.bln_e(strcmd_s) except: # delete the folder if summarization failed. print " **** summarization failed, removed hdfs folder." hdfsutil.rm(config.hdfs_qos_rg_day % (datestamp), r=True) else: print " file exists."
def mrqos_table_cleanup():
    """ Drop every partition of the individual MRQOS measurement tables
    (score, distance, in_country, in_continent, ra_load) that is older than
    the configured retention threshold, in both Hive and HDFS. """
    # get the lowest partition by checking the HDFS folders
    score_partitions = hdfsutil.ls(config.hdfs_table_score)
    str_parts_list = [i.split('=', 1)[1] for i in score_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if each partition is within the threshold; if not, drop it from
    # the hive table and remove the folder from hdfs
    timenow = int(time.time())
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']
    for item in mtype:
        # look up config.hdfs_table_<item> (e.g. config.hdfs_table_score)
        this_partitions = hdfsutil.ls(getattr(config, 'hdfs_table_%s' % item))
        str_parts_list = [i.split('=', 1)[1] for i in this_partitions]
        str_parts_list_int = map(int, str_parts_list)
        print " ## for table: %s" % item
        print " ## ",
        print str_parts_list_int
        for partition in str_parts_list_int:
            if partition < timenow - config.mrqos_table_delete:
                try:
                    print " ## handling table: %s with ts=%s" % (item, str(partition))
                    # drop partitions (ok even if partition does not exist)
                    hiveql_str = 'use mrqos; alter table ' + item + ' drop if exists partition(ts=%s)' % str(partition)
                    beeline.bln_e(hiveql_str)
                    # remove data from HDFS (ok even if folder in hdfs does not exist)
                    hdfs_d = os.path.join(config.hdfs_table, item, 'ts=%s' % partition)
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed in hive table clean up in table: %s." % item
                    print e.message
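# --- hedged sketch (not part of the original scripts) ---------------------
# The cleanup above repeats the same "drop hive partition, then remove the
# HDFS folder" pair for every expired ts.  A small helper like the one below
# could factor that pattern out.  It assumes the project-local beeline,
# hdfsutil and config modules used elsewhere in this file; the helper name
# itself is hypothetical.
import os
import subprocess as sp


def drop_expired_partition(table, ts, hdfs_table_root):
    """Drop partition ts=<ts> of mrqos.<table> and remove its HDFS folder.

    Returns True on success, False if either step raised CalledProcessError.
    """
    try:
        hiveql_str = 'use mrqos; alter table %s drop if exists partition(ts=%s)' % (table, ts)
        beeline.bln_e(hiveql_str)
        hdfsutil.rm(os.path.join(hdfs_table_root, table, 'ts=%s' % ts), r=True)
        return True
    except sp.CalledProcessError as e:
        print ">> failed cleaning up table %s, ts=%s: %s" % (table, ts, e.message)
        return False

# possible use inside mrqos_table_cleanup():
#     for partition in str_parts_list_int:
#         if partition < timenow - config.mrqos_table_delete:
#             drop_expired_partition(item, partition, config.hdfs_table)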
def main():
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'io_ratio_window_summarize.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    max_retrial = 10
    ts = int(time.time())
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts)))
    window_length = config.mrqos_join_delete + 1*24*60*60
    datestamp_14d_ago = time.strftime('%Y%m%d', time.gmtime(float(ts - window_length)))

    logger.info('## Summarize IORATIO table started at %s.' % str(ts))
    logger.info("direct summarize and insert into mrqos_sum_io.")

    # direct join and insert in hive
    f = open('/home/testgrp/MRQOS/mrqos_hive_query/MRQOS_table_summarize_ioratio.hive', 'r')
    strcmd = f.read()
    strcmd_s = strcmd % (str(datestamp), str(datestamp_14d_ago), str(datestamp))
    f.close()

    logger.info(" **** perform beeline for ioratio join.")
    retrial = 0
    while retrial < max_retrial:
        try:
            tic = time.time()
            beeline.bln_e(strcmd_s)
            logger.info('perform beeline for ioratio for 2W timeframe succeeded with time cost = %s second' % str(time.time() - tic))
            break  # stop retrying once the insert succeeded
        except sp.CalledProcessError as e:
            retrial += 1
            logger.error('perform beeline for ioratio for 2W timeframe failed.')
            logger.error('error message: %s', e.message)
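# --- hedged sketch (not part of the original scripts) ---------------------
# Every script in this collection re-implements the same "retry a beeline
# call up to N times, break on success" loop (and the loop above only works
# because of the explicit break).  A generic wrapper along these lines could
# be shared instead; run_with_retries is a hypothetical name and the
# beeline/logging usage mirrors the surrounding code.
import subprocess as sp
import time


def run_with_retries(fn, max_retrial=10, logger=None):
    """Call fn() until it succeeds or max_retrial attempts have failed.

    Returns True on success, False if every attempt raised CalledProcessError.
    """
    for attempt in range(max_retrial):
        tic = time.time()
        try:
            fn()
            if logger:
                logger.info('succeeded on attempt %d, cost = %.1f sec', attempt + 1, time.time() - tic)
            return True
        except sp.CalledProcessError as e:
            if logger:
                logger.error('attempt %d failed after %.1f sec: %s', attempt + 1, time.time() - tic, e.message)
    return False

# possible use in the loop above:
#     run_with_retries(lambda: beeline.bln_e(strcmd_s), max_retrial, logger)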
def main(): """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """ ts = calendar.timegm(time.gmtime()) print "###################" print "# Performing the hourly mrqos_region summary" print "# starting processing time is " + str(ts) print "###################" ts_last_hour = ts-3600 datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour))) hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour))) hour_list = [str("%02d" % x) for x in range(24)] region_summary_retrial_max = 10 # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s, hour = %s." % (datestamp, hourstamp), if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')): f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp) query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp)) print " **** perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: beeline.bln_e(strcmd_s) try: beeline.bln_e_output(strcmd_g, query_result_file) except: print " **** copy to local failed!" break except: # delete the folder if summarization failed. print " **** summarization failed upto #retrials="+str(count_retrial) hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: print " file exists." # check if the summary has been performed since the beginning of the day, last check on day X is X+1/0:30:00 for hour in hour_list: if hour < hourstamp: print " **** checking day = %s, hour = %s." % (datestamp, hour), if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hour), '000000_0.deflate')): f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hour, datestamp, hour, datestamp, hour) f.close() print " **** perform beeline for hourly summary for day = %s, hour = %s." %(datestamp, hour) try: beeline.bln_e(strcmd_s) except: # delete the folder if summarization failed. print " **** summarization failed, removed hdfs folder." hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hour), r=True) else: print " file exists."
def main():
    datestamp = '20160316'
    hourstamp = '04'

    # test0: the original order of join
    f = open(os.path.join(config.mrqos_hive_query, 'test0_mrqos_region_view_hour.hive'), 'r')
    strcmd = f.read()
    strcmd_s1 = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
    f.close()

    # test1: the reverse order of join
    f = open(os.path.join(config.mrqos_hive_query, 'test_mrqos_region_view_hour.hive'), 'r')
    strcmd = f.read()
    strcmd_s2 = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
    f.close()

    fail_count = [0] * 2
    time_count = [0] * 2
    n_iter = 10  # renamed from "iter" to avoid shadowing the builtin
    for item in range(n_iter):
        tic = time.time()
        fail0 = False
        fail1 = False
        try:
            beeline.bln_e(strcmd_s1)
            span1 = time.time() - tic
            time_count[0] += span1
        except:
            span1 = time.time() - tic
            fail_count[0] += 1
            fail0 = True
        tic = time.time()
        try:
            beeline.bln_e(strcmd_s2)
            span2 = time.time() - tic
            time_count[1] += span2
        except:
            span2 = time.time() - tic
            fail_count[1] += 1
            fail1 = True
        print "test0 takes %s (%s) and test1 takes %s (%s)" % (str(span1), "failed" if fail0 else "ok",
                                                               str(span2), "failed" if fail1 else "ok")

    print "<<< overall result >>>"
    # note: the averages below divide by the number of successful runs and
    # would raise ZeroDivisionError if every trial of a variant failed.
    print "test0 takes %s and test1 takes %s" % (str(time_count[0] / (n_iter - fail_count[0])),
                                                 str(time_count[1] / (n_iter - fail_count[1])))
def main(): """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """ ts = calendar.timegm(time.gmtime()) print "###################" print "# Performing the hourly mrqos_region summary" print "# starting processing time is " + str(ts) print "###################" ts_last_hour = ts-3600 datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour))) hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour))) hour_list = [str("%02d" % x) for x in range(24)] hour_list = [x for x in hour_list if x <= hourstamp] region_summary_retrial_max = 10 # check if the summary has been performed on this particular hour (last hour) folders_day = '/'.join(str(config.hdfs_qos_rg_view_hour % (datestamp, '00')).split('/')[0:-1]) # check if the summary folder for "this day" (datestamp) has been created or not, if not, create one if hdfsutil.test_dic(folders_day): hdfsutil.mkdir(folders_day) folders_in = [folders_day+'/hour=%s' % x for x in hour_list] folders_out = hdfsutil.ls(folders_day) folders_missing = [x for x in folders_in if x not in folders_out] folders_missing.sort(reverse=True) for item in folders_missing: hourstamp = item[-2:] print " **** missing data for day = %s, hour = %s." % (datestamp, hourstamp), f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp) query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp)) print " **** perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: beeline.bln_e(strcmd_s) try: beeline.bln_e_output(strcmd_g, query_result_file) except: print " **** copy to local failed!" break except sp.CalledProcessError as e: # delete the folder if summarization failed. print " **** summarization failed upto #retrials="+str(count_retrial) print " **** ", print e.message hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1
def main():
    # initialize the logger
    logging.basicConfig(filename=os.path.join('/home/testgrp/logs/', 'mapmon_summarize.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    timenow = int(time.time())
    datenow = str(datetime.date.today() - datetime.timedelta(1))
    date_idx = datenow[0:4] + datenow[5:7] + datenow[8:10]

    # get the latest barebone day_idx
    bb_day_idx = beeline.get_last_partitions('mapper.barebones').split('=')[1]
    logger.info("barebone index: day={}".format(bb_day_idx))

    # get the mpd uuids that arrived yesterday
    uuid_list = [x.split('=')[-1] for x in hdfsutil.ls(os.path.join(os.path.dirname(config.hdfs_table),
                                                                    'mapper',
                                                                    'mapmon',
                                                                    'day={}'.format(date_idx)))]

    for uuid_idx in uuid_list:
        logger.info("dealing with day={}, uuid={}".format(date_idx, uuid_idx))
        file_location = os.path.join(config.hdfs_table,
                                     'mapmon_sum',
                                     'day={}'.format(date_idx),
                                     'mpd_uuid={}'.format(uuid_idx))
        # create the destination folder if it is missing
        # (hdfsutil.test_dic returns a truthy value when the folder is missing)
        if hdfsutil.test_dic(file_location):
            logger.info('creating folder: {}'.format(file_location))
            hdfsutil.mkdir(file_location)

        if hdfsutil.test_file(os.path.join(file_location, '000000_0.deflate')):
            f = open(os.path.join(config.mrqos_hive_query, 'mapmon_summarize.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (date_idx, uuid_idx, bb_day_idx,
                                 date_idx, uuid_idx,
                                 date_idx, uuid_idx)
            f.close()
            try:
                beeline.bln_e(strcmd_s)
            except:
                # delete the folder if summarization failed.
                logger.warn("summarization failed, removing hdfs folder.")
                hdfsutil.rm(file_location, r=True)
        else:
            logger.info(" file exists.")
def cleanup_mrqos_region_related_tables(datestamp, hour):
    tables = ['mrqos_region_hour', 'case_view_hour', 'region_view_hour']
    for table_item in tables:
        try:
            # drop partitions (ok even if partition does not exist)
            hiveql_str = 'use mrqos; alter table %s drop if exists partition(datestamp=%s, hour=%s)' % (table_item, str(datestamp), str(hour))
            beeline.bln_e(hiveql_str)
            # remove data from HDFS (ok even if folder in hdfs does not exist)
            hdfs_d = os.path.join(config.hdfs_table,
                                  table_item,
                                  'datestamp=%s' % str(datestamp),
                                  'hour=%s' % str(hour))
            hdfsutil.rm(hdfs_d, r=True)
        except sp.CalledProcessError:
            print ">> failed in hive table clean up in table: %s for partition datestamp=%s, hour=%s." % (table_item, str(datestamp), str(hour))
            pass
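# --- hedged sketch (not part of the original scripts) ---------------------
# Illustrative call of cleanup_mrqos_region_related_tables() for one full
# day, the way the repair script below iterates over hours; the datestamp
# value and the wrapper name are only examples.
def cleanup_whole_day(datestamp):
    """Drop the hourly region partitions for every hour of the given day."""
    for hour in ["%02d" % x for x in range(24)]:
        cleanup_mrqos_region_related_tables(datestamp, hour)

# cleanup_whole_day('20160316')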
def main():
    # #### MRQOS region LOCAL PART ####
    # ignore the local timestamp, use the timestamps the files are tagged with
    timenow = time.time()
    print "###################"
    print "# Performing the hourly mrqos_region insert"
    print "# starting processing time is " + str(timenow)
    print "###################"

    # glob gives the full path of each pending qos_region file
    list_qos_files = glob.glob(os.path.join(config.mrqos_data, 'qos_region.*.tmp'))

    for qos_file in list_qos_files:
        infoitem = qos_file.rsplit('.', 2)
        ts = infoitem[-2]
        datestamp = time.strftime('%Y%m%d', time.localtime(float(ts)))
        # do we need hourly partition or not?
        hourstamp = time.strftime('%H', time.localtime(float(ts)))
        print ' file = ' + qos_file
        print ' timestamp = %s;' % (ts)

        # put the file to the HDFS folder and remove it from local disk
        try:
            print ' upload to HDFS'
            hdfs_rg_destination = config.hdfs_qos_rg_info % (datestamp, hourstamp, ts)
            hdfs.mkdir(hdfs_rg_destination)
            hdfs.put(qos_file, hdfs_rg_destination)
            print ' adding partition'
            hiveql_str = config.add_rg_partition % (datestamp, hourstamp, ts)
            beeline.bln_e(hiveql_str)
            print ' remove local file: ' + qos_file
            os.remove(qos_file)
        except sp.CalledProcessError as e:
            print e.message
            print 'MRQOS region(RG) information update failed for timestamp=%s' % (ts)
            # if the HDFS destination already exists, the file was uploaded
            # before; drop the local copy so it is not retried forever.
            if 'File exists' in e.message:
                print ' remove local file: ' + qos_file
                os.remove(qos_file)
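# --- hedged sketch (not part of the original scripts) ---------------------
# The insert loop above recovers the epoch timestamp from file names of the
# form "qos_region.<ts>.tmp" via rsplit('.', 2).  A standalone illustration
# of that parsing (the sample path is made up):
import time

qos_file = '/home/testgrp/MRQOS/mrqos_data/qos_region.1458100000.tmp'
ts = qos_file.rsplit('.', 2)[-2]                                 # '1458100000'
datestamp = time.strftime('%Y%m%d', time.localtime(float(ts)))   # e.g. '20160316'
hourstamp = time.strftime('%H', time.localtime(float(ts)))
print qos_file, '->', ts, datestamp, hourstamp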
def upload_to_hive(listname, hdfs_d, ts, tablename):
    """ Create the partition directory hdfs_d in HDFS, upload the file
    listname into it, and then add the partition ts=<ts> to the Hive table
    <tablename>. """
    # hdfs_d = config.hdfsclnspp % (ts)
    # create the partition directory
    try:
        sp.check_call(['hadoop', 'fs', '-mkdir', hdfs_d])
    except sp.CalledProcessError:
        raise HadoopDirectoryCreateError
    # upload the data
    try:
        sp.check_call(['hadoop', 'fs', '-put', listname, hdfs_d])
    except sp.CalledProcessError:
        raise HadoopDataUploadError
    # add the partition
    try:
        hiveql_str = 'use mrqos; alter table ' + tablename + ' add partition(ts=%s);' % (ts)
        beeline.bln_e(hiveql_str)
        # sp.check_call(['hive', '-e', hiveql_str])
    except sp.CalledProcessError:
        raise HiveCreatePartitionError
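# --- hedged sketch (not part of the original scripts) ---------------------
# upload_to_hive() raises HadoopDirectoryCreateError, HadoopDataUploadError
# and HiveCreatePartitionError, which are defined elsewhere in the project.
# Minimal stand-in definitions, if one needed to exercise the function on
# its own, could look like this:
class HadoopDirectoryCreateError(Exception):
    """hadoop fs -mkdir for the partition directory failed."""


class HadoopDataUploadError(Exception):
    """hadoop fs -put of the data file failed."""


class HiveCreatePartitionError(Exception):
    """ALTER TABLE ... ADD PARTITION failed in beeline."""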
def main(): """ this function will do the query on 5 different measurement and upload the data to hdfs accordingly, this also join tables at single time point """ # different queries (various types) mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load'] sql = """sql2 -q map.mapnoccthree.query.akadns.net --csv "`cat """ post = """`" | tail -n+3 | awk -F"," 'BEGIN{OFS=","}{$1=""; print $0}' | sed 's/^,//g' > """ # current time timenow = int(time.time()) print "###################" print "Start processing the data back in for 10 minute joins" print "starting processing time is " + str(timenow) print "###################" # fetch the data through query with retrials print " **** querying mrqos data." for item in mtype: flag = 0 count = 0 dest = os.path.join(config.mrqos_data, item + '.tmp') aggs = os.path.join(config.mrqos_query, item + '.qr') cmd = sql + aggs + post + dest n_retrial = config.query_retrial t_timeout = config.query_timeout # multiple times with timeout scheme while (flag == 0) and (count < n_retrial): try: with ytt.Timeout(t_timeout): sp.call(cmd, shell=True) flag = 1 except: count += 1 # if any of the query not fetched successfully, break all and stop running if count >= n_retrial: print ">> data fetch failed in querying table %s" % item return # provide SCORE table with peak/off-peak attribute print " **** provide PEAK in score." sp.call([config.provide_peak], shell=True) # backup the individual query file by copying to backup folder print " **** backing up queried results." if not os.path.exists('/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow)): os.makedirs('/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow)) for item in mtype: filesrc = os.path.join(config.mrqos_data, item + '.tmp') filedst = '/home/testgrp/MRQOS/mrqos_data/backup/%s/' % str(timenow) shutil.copy(filesrc, filedst) # upload to hdfs and link to hive tables print " **** uploading to hdfs and hive." try: # adding the individual query result to hdfs and add hive partitions for item in mtype: listname = os.path.join(config.mrqos_data, item + '.tmp') hdfs_d = os.path.join(config.hdfs_table, item, 'ts=%s' % str(timenow)) upload_to_hive(listname, hdfs_d, str(timenow), item) shutil.rmtree('/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow)) # new version of the join tables in hive: direct insert # # specify the new joined file in hdfs hdfs_file = os.path.join(config.hdfs_table, 'mrqos_join', 'ts=%s' % str(timenow), '000000_0.deflate') # specify the local copy of the joined file local_file = os.path.join(config.mrqos_data_backup, '000000_0.deflate') try: print " **** direct join and insert into mrqos_join." # direct join and insert in hive f = open('/home/testgrp/MRQOS/MRQOS_table_join2.hive', 'r') strcmd = f.read() strcmd_s = strcmd % (str(timenow), str(timenow), str(timenow), str(timenow), str(timenow), str(timenow)) f.close() print " **** perform beeline for join." beeline.bln_e(strcmd_s) # have the local copy of the joined file print " **** copy the joined file for backup." 
hdfsutil.get(hdfs_file, local_file) except sp.CalledProcessError as e: print ">> direct join and insert failed, trying to copy the last succeeded one" print e.message try: # upload the last succeeded one from local print " **** copying backups from local to hdfs" hdfsutil.put(local_file, hdfs_file) try: # using hive to add partitions to joined query results print " **** adding hive partitions" hiveql_str = 'use mrqos; alter table mrqos_join add partition(ts=%s);' % str(timenow) beeline.bln_e(hiveql_str) except sp.CalledProcessError as e: print ">> copying from duplicated file for mrqos_join failed in adding partitions" print e.message #raise HiveCreatePartitionError except: print "copying from duplicated file for mrqos_join failed in uploading to hdfs" except: print "HDFS upload failed, backup file retains" # clear the expired data in mrqos_table print " **** clean up mrqos individual table." mrqos_table_cleanup() # clear the expired data in mrqos_join print " **** clean up mrqos joined table." mrqos_join_cleanup()
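# --- hedged sketch (not part of the original scripts) ---------------------
# The query-fetch loop above combines a per-attempt timeout (the project's
# ytt.Timeout context manager, which appears to raise when the timeout is
# exceeded) with a bounded retry count.  The same pattern as a reusable
# helper might look like this; the function name is illustrative and
# ytt/config come from the surrounding project.
import subprocess as sp


def fetch_with_timeout(cmd, n_retrial, t_timeout):
    """Run the shell command with a timeout, retrying up to n_retrial times.

    Returns True if one attempt completed inside the timeout, else False.
    """
    for _ in range(n_retrial):
        try:
            with ytt.Timeout(t_timeout):
                sp.call(cmd, shell=True)
            return True
        except:
            # assumed: ytt.Timeout raises once t_timeout seconds have passed
            continue
    return False

# possible use in the loop above:
#     if not fetch_with_timeout(cmd, config.query_retrial, config.query_timeout):
#         print ">> data fetch failed in querying table %s" % item
#         return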
def main():
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'io_ratio_join.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    ts = int(time.time())
    logger.info('########### ts=%s ###########' % str(ts))
    #datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts)))
    #hourstamp = time.strftime('%H', time.gmtime(float(ts)))

    # IO-Ratio Join:
    # take datestamp / hour / ts from the latest mrqos_region partition
    last_mrqos_region_partition = beeline.get_last_partitions('mrqos.mrqos_region')
    [datestamp, hourstamp, ts_region] = [x.split('=')[1] for x in last_mrqos_region_partition.split('/')]
    logger.info('MRQOS mrqos_region partition: datestamp=%s, hour=%s, ts_region=%s' % (datestamp, hourstamp, ts_region))

    # pick the newest maprule_info partition that is not newer than ts_region
    mapruleinfo_partitions = [x for x in sorted(beeline.show_partitions('mrqos.maprule_info').split('\n'), reverse=True) if '=' in x]
    mapruleinfo_partitions = [x for x in mapruleinfo_partitions if x < 'ts=%s' % ts_region]
    ts_mapruleinfo = mapruleinfo_partitions[0].split('=')[1]
    logger.info('MRQOS maprule_info partition: ts_mapruleinfo=%s' % ts_mapruleinfo)

    region_summary_retrial_max = 10

    # ############################### #
    # The In-Out Ratio hive procedure #
    # ############################### #
    # check if the join has already been performed for this partition
    # print " **** checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_table,
                                       'mrqos_ioratio',
                                       'datestamp=%s' % datestamp,
                                       'hour=%s' % hourstamp,
                                       'ts=%s' % ts_region,
                                       '000000_0.deflate')):
        logger.info(' Joined file does not exist.')
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_ioratio.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, ts_region,
                             datestamp, hourstamp, ts_region,
                             ts_mapruleinfo)
        print strcmd_s
        f.close()
        # strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        # query_result_file = os.path.join(config.mrqos_query_result, 'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))

        print " BLN for hourly summary: day = %s, hour = %s. " % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            tic = time.time()
            try:
                beeline.bln_e(strcmd_s)
                logger.info(' ****** success with time cost = %s.' % str(time.time() - tic))
                break
            except sp.CalledProcessError as e:
                # delete the folder if the join failed, then retry.
                logger.error(' ****** failed with time cost = %s up to # retrials=%s' % (str(time.time() - tic), str(count_retrial)))
                logger.error('error %s' % e.message)
                hdfsutil.rm(os.path.join(config.hdfs_table,
                                         'mrqos_ioratio',
                                         'datestamp=%s' % datestamp,
                                         'hour=%s' % hourstamp,
                                         'ts=%s' % ts_region), r=True)
                count_retrial += 1
    else:
        logger.info(' Joined file exists.')
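# --- hedged sketch (not part of the original scripts) ---------------------
# beeline.get_last_partitions() above returns a partition spec of the form
# "datestamp=YYYYMMDD/hour=HH/ts=<epoch>", which the script splits into its
# three values.  The parsing on its own (the sample string is made up):
last_partition = 'datestamp=20160316/hour=04/ts=1458100000'
datestamp, hourstamp, ts_region = [x.split('=')[1] for x in last_partition.split('/')]
print datestamp, hourstamp, ts_region   # 20160316 04 1458100000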
def main(argv): """ get the date and hour for the specified day and hour. Clean(drop) and rebuild the table partition. """ try: opts, args = getopt.getopt(argv,"qd:h:",["datestamp=","hour="]) except getopt.GetoptError: print 'region_summary_hour_repair.py -d <datestamp> -h <hour>' sys.exit(2) hour ='' datestamp = '' for opt, arg in opts: if opt == '-q': print 'region_summary_hour_repair.py -d <datestamp> -h <hour>' sys.exit() elif opt in ("-d", "--datestamp"): datestamp = arg elif opt in ("-h", "--hour"): hour = arg ts = calendar.timegm(time.gmtime()) print "###################" print "# Performing the repair of the mrqos_region summary" print "# starting processing time is " + str(ts) + " = " + time.strftime('GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts)) print "###################" if (not datestamp and not hour): print 'region_summary_hour_repair.py -d <datestamp> -h <hour>' sys.exit(2) print 'Fixing datestamp = %s' % datestamp if not hour: hour_list = [str("%02d" % x) for x in range(24)] print 'Fixing hour = %s' % hour_list else: print 'Fixing hour = %s' % hour #ts_last_hour = ts-3600 #datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour))) #hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour))) #hour_list = [str("%02d" % x) for x in range(24)] region_summary_retrial_max = 10 print " #**** first perform table cleanups: " if not hour: for hourstamp in hour_list: cleanup_mrqos_region_related_tables(datestamp, hourstamp) else: hourstamp = hour cleanup_mrqos_region_related_tables(datestamp, hourstamp) print " #**** rebuild the db / table: " if not hour: for hourstamp in hour_list: # ############################### # # The SUMMARY HOUR hive procedure # # ############################### # print " **** summary hour tour:" # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s, hour = %s." % (datestamp, hourstamp), if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp), '000000_0.deflate')): print " file not exits,", f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp) query_result_file = os.path.join(config.mrqos_query_result,'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp)) print " BLN for hourly summary: day = %s, hour = %s. " %(datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: tic = time.time() try: beeline.bln_e(strcmd_s) print " ****** success with time cost = %s." % str(time.time()-tic) #try: # beeline.bln_e_output(strcmd_g, query_result_file) #except: # print " **** copy to local failed, retry!" # beeline.bln_e_output(strcmd_g, query_result_file) break except: # delete the folder if summarization failed. print " ****** failed with time cost = %s upto # retrials=%s" % (str(time.time()-tic), str(count_retrial)) hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: print " file exists." 
# ############################ # # The CASE VIEW hive procedure # # ############################ # print " **** case view tour:" # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s, hour = %s." % (datestamp, hourstamp), if hdfsutil.test_file(os.path.join(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), '000000_0.deflate')): print " file not exits,", f = open(os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp) query_result_file = os.path.join(config.mrqos_query_result,'case_view_hour.%s.%s.csv' % (datestamp, hourstamp)) print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: tic = time.time() beeline.bln_e(strcmd_s) print " ****** success with time cost = %s." % str(time.time()-tic) # repair don't care about moving the result to SQLite DB #try: # beeline.bln_e_output(strcmd_g, query_result_file) #except: # print " **** copy to local failed, retry!" # beeline.bln_e_output(strcmd_g, query_result_file) break except: # delete the folder if summarization failed. print " ****** failed with time cost = %s upto #retrials=%s" % (str(time.time()-tic), str(count_retrial)) hdfsutil.rm(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: print " file exists." # ############################## # # The REGION VIEW hive procedure # # ############################## # print " **** region view tour:" # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s, hour = %s." % (datestamp, hourstamp), if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')): print " file not exits,", f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp) query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp)) print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: tic = time.time() beeline.bln_e(strcmd_s) print " ****** success with time cost = %s." % str(time.time()-tic) # repair don't care about moving the result to SQLite DB #try: # beeline.bln_e_output(strcmd_g, query_result_file) #except: # print " **** copy to local failed, retry!" # beeline.bln_e_output(strcmd_g, query_result_file) break except: # delete the folder if summarization failed. print " ****** failed with time cost = %s upto #retrials=%s" % (str(time.time()-tic), str(count_retrial)) hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: print " file exists." else: # ############################### # # The SUMMARY HOUR hive procedure # # ############################### # print " **** summary hour tour:" # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s, hour = %s." 
% (datestamp, hourstamp), if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp), '000000_0.deflate')): print " file not exits,", f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp) query_result_file = os.path.join(config.mrqos_query_result,'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp)) print " BLN for hourly summary: day = %s, hour = %s. " %(datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: tic = time.time() try: beeline.bln_e(strcmd_s) print " ****** success with time cost = %s." % str(time.time()-tic) #try: # beeline.bln_e_output(strcmd_g, query_result_file) #except: # print " **** copy to local failed, retry!" # beeline.bln_e_output(strcmd_g, query_result_file) break except: # delete the folder if summarization failed. print " ****** failed with time cost = %s upto # retrials=%s" % (str(time.time()-tic), str(count_retrial)) hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: print " file exists." # ############################ # # The CASE VIEW hive procedure # # ############################ # print " **** case view tour:" # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s, hour = %s." % (datestamp, hourstamp), if hdfsutil.test_file(os.path.join(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), '000000_0.deflate')): print " file not exits,", f = open(os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp) query_result_file = os.path.join(config.mrqos_query_result,'case_view_hour.%s.%s.csv' % (datestamp, hourstamp)) print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: tic = time.time() beeline.bln_e(strcmd_s) print " ****** success with time cost = %s." % str(time.time()-tic) # repair don't care about moving the result to SQLite DB #try: # beeline.bln_e_output(strcmd_g, query_result_file) #except: # print " **** copy to local failed, retry!" # beeline.bln_e_output(strcmd_g, query_result_file) break except: # delete the folder if summarization failed. print " ****** failed with time cost = %s upto #retrials=%s" % (str(time.time()-tic), str(count_retrial)) hdfsutil.rm(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: print " file exists." # ############################## # # The REGION VIEW hive procedure # # ############################## # print " **** region view tour:" # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s, hour = %s." 
% (datestamp, hourstamp), if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')): print " file not exits,", f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp) query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp)) print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: tic = time.time() beeline.bln_e(strcmd_s) print " ****** success with time cost = %s." % str(time.time()-tic) # repair don't care about moving the result to SQLite DB #try: # beeline.bln_e_output(strcmd_g, query_result_file) #except: # print " **** copy to local failed, retry!" # beeline.bln_e_output(strcmd_g, query_result_file) break except: # delete the folder if summarization failed. print " ****** failed with time cost = %s upto #retrials=%s" % (str(time.time()-tic), str(count_retrial)) hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: print " file exists."
def main(): """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """ ts = calendar.timegm(time.gmtime()) print "###################" print "# Performing the hourly mrqos_region summary" print "# starting processing time is " + str(ts) + " = " + time.strftime( 'GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts)) print "###################" ts_last_hour = ts - 3600 datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour))) hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour))) #hour_list = [str("%02d" % x) for x in range(24)] region_summary_retrial_max = 10 # ############################### # # The SUMMARY HOUR hive procedure # # ############################### # print " **** summary hour tour:" # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s, hour = %s." % (datestamp, hourstamp), if hdfsutil.test_file( os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp), '000000_0.deflate')): print " file not exits,", f = open( os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % ( datestamp, hourstamp) query_result_file = os.path.join( config.mrqos_query_result, 'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp)) print " BLN for hourly summary: day = %s, hour = %s. " % (datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: tic = time.time() try: beeline.bln_e(strcmd_s) print " ****** success with time cost = %s." % str( time.time() - tic) #try: # beeline.bln_e_output(strcmd_g, query_result_file) #except: # print " **** copy to local failed, retry!" # beeline.bln_e_output(strcmd_g, query_result_file) break except sp.CalledProcessError as e: # delete the folder if summarization failed. print " ****** failed with time cost = %s upto # retrials=%s" % ( str(time.time() - tic), str(count_retrial)) print e.message hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: print " file exists." # ############################ # # The CASE VIEW hive procedure # # ############################ # print " **** case view tour:" # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s, hour = %s." % (datestamp, hourstamp), if hdfsutil.test_file( os.path.join( config.hdfs_qos_case_view_hour % (datestamp, hourstamp), '000000_0.deflate')): print " file not exits,", f = open( os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % ( datestamp, hourstamp) query_result_file = os.path.join( config.mrqos_query_result, 'case_view_hour.%s.%s.csv' % (datestamp, hourstamp)) print " BLN for hourly summary for day = %s, hour = %s." 
% (datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: tic = time.time() beeline.bln_e(strcmd_s) print " ****** success with time cost = %s." % str( time.time() - tic) try: beeline.bln_e_output(strcmd_g, query_result_file) except sp.CalledProcessError as e: print " **** copy to local failed, retry!" print e.message beeline.bln_e_output(strcmd_g, query_result_file) break except sp.CalledProcessError as e: # delete the folder if summarization failed. print " ****** failed with time cost = %s upto #retrials=%s" % ( str(time.time() - tic), str(count_retrial)) print e.message hdfsutil.rm(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: print " file exists." # ############################## # # The REGION VIEW hive procedure # # ############################## # print " **** region view tour:" # check if the summary has been performed on this particular hour (last hour) print " **** checking day = %s, hour = %s." % (datestamp, hourstamp), if hdfsutil.test_file( os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')): print " file not exits,", f = open( os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % ( datestamp, hourstamp) query_result_file = os.path.join( config.mrqos_query_result, 'region_view_hour.%s.%s.csv' % (datestamp, hourstamp)) print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: tic = time.time() beeline.bln_e(strcmd_s) print " ****** success with time cost = %s." % str( time.time() - tic) try: beeline.bln_e_output(strcmd_g, query_result_file) except sp.CalledProcessError as e: print " **** copy to local failed, retry!" print e.message beeline.bln_e_output(strcmd_g, query_result_file) break except sp.CalledProcessError as e: # delete the folder if summarization failed. print " ****** failed with time cost = %s upto #retrials=%s" % ( str(time.time() - tic), str(count_retrial)) print e.message hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: print " file exists."
def main(): """ this function will do the query on 5 different measurement and upload the data to hdfs accordingly, this also join tables at single time point """ # different queries (various types) mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load'] sql = """sql2 -q map.mapnoccthree.query.akadns.net --csv "`cat """ post = """`" | tail -n+3 | awk -F"," 'BEGIN{OFS=","}{$1=""; print $0}' | sed 's/^,//g' > """ # current time timenow = int(time.time()) print "###################" print "Start processing the data back in for 10 minute joins" print "starting processing time is " + str(timenow) print "###################" # fetch the data through query with retrials print " **** querying mrqos data." for item in mtype: flag = 0 count = 0 dest = os.path.join(config.mrqos_data, item + '.tmp') aggs = os.path.join(config.mrqos_query, item + '.qr') cmd = sql + aggs + post + dest n_retrial = config.query_retrial t_timeout = config.query_timeout # multiple times with timeout scheme while (flag == 0) and (count < n_retrial): try: with ytt.Timeout(t_timeout): sp.call(cmd, shell=True) flag = 1 except: count += 1 # if any of the query not fetched successfully, break all and stop running if count >= n_retrial: print ">> data fetch failed in querying table %s" % item return # provide SCORE table with peak/off-peak attribute print " **** provide PEAK in score." sp.call([config.provide_peak], shell=True) # backup the individual query file by copying to backup folder print " **** backing up queried results." if not os.path.exists( '/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow)): os.makedirs('/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow)) for item in mtype: filesrc = os.path.join(config.mrqos_data, item + '.tmp') filedst = '/home/testgrp/MRQOS/mrqos_data/backup/%s/' % str( timenow) shutil.copy(filesrc, filedst) # upload to hdfs and link to hive tables print " **** uploading to hdfs and hive." try: # adding the individual query result to hdfs and add hive partitions for item in mtype: listname = os.path.join(config.mrqos_data, item + '.tmp') hdfs_d = os.path.join(config.hdfs_table, item, 'ts=%s' % str(timenow)) upload_to_hive(listname, hdfs_d, str(timenow), item) shutil.rmtree('/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow)) # new version of the join tables in hive: direct insert # # specify the new joined file in hdfs hdfs_file = os.path.join(config.hdfs_table, 'mrqos_join', 'ts=%s' % str(timenow), '000000_0.deflate') # specify the local copy of the joined file local_file = os.path.join(config.mrqos_data_backup, '000000_0.deflate') try: print " **** direct join and insert into mrqos_join." # direct join and insert in hive f = open('/home/testgrp/MRQOS/MRQOS_table_join2.hive', 'r') strcmd = f.read() strcmd_s = strcmd % (str(timenow), str(timenow), str(timenow), str(timenow), str(timenow), str(timenow)) f.close() print " **** perform beeline for join." beeline.bln_e(strcmd_s) # have the local copy of the joined file print " **** copy the joined file for backup." 
hdfsutil.get(hdfs_file, local_file) except sp.CalledProcessError as e: print ">> direct join and insert failed, trying to copy the last succeeded one" print e.message try: # upload the last succeeded one from local print " **** copying backups from local to hdfs" hdfsutil.put(local_file, hdfs_file) try: # using hive to add partitions to joined query results print " **** adding hive partitions" hiveql_str = 'use mrqos; alter table mrqos_join add partition(ts=%s);' % str( timenow) beeline.bln_e(hiveql_str) except sp.CalledProcessError as e: print ">> copying from duplicated file for mrqos_join failed in adding partitions" print e.message #raise HiveCreatePartitionError except: print "copying from duplicated file for mrqos_join failed in uploading to hdfs" except: print "HDFS upload failed, backup file retains" # clear the expired data in mrqos_table print " **** clean up mrqos individual table." mrqos_table_cleanup() # clear the expired data in mrqos_join print " **** clean up mrqos joined table." mrqos_join_cleanup()
def main(): """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """ ts = calendar.timegm(time.gmtime()) logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'cron_region_summary_hour.log'), level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S') logger = logging.getLogger(__name__) # start the logging logger.info("###################") logger.info("# Performing the hourly mrqos_region summary") logger.info("# starting time: " + str(ts) + " = " + time.strftime('GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts))) logger.info("###################") # parameter: backfilter length bf_length = config.region_summary_back_filling ts_last_couple_hour_list = [ts-(1+x)*3600 for x in range(bf_length)] for ts_last_hour in ts_last_couple_hour_list: datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour))) hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour))) region_summary_retrial_max = 10 # ############################### # # The SUMMARY HOUR hive procedure # # ############################### # #logger.info(" **** summary hour tour: checking day = %s, hour = %s." % (datestamp, hourstamp)) # check if the summary has been performed on this particular hour (last hour) if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp), '000000_0.deflate')): logger.info("** region summary hour: checking day = %s, hour = %s, and file does not exist." % (datestamp, hourstamp)) f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() count_retrial = 0 while count_retrial < region_summary_retrial_max: tic = time.time() try: beeline.bln_e(strcmd_s) logger.info("BLN region summary hour success @ cost = %s sec." % str(time.time()-tic)) break except sp.CalledProcessError as e: # delete the folder if summarization failed. logger.info("BLN region summary hour failed @ cost = %s sec in retrial #%s" % (str(time.time()-tic), str(count_retrial))) logger.exception("message") hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: logger.info("** region summary hour: checking day = %s, hour = %s, and file exists." % (datestamp, hourstamp)) # ############################ # # The CASE VIEW hive procedure # # ############################ # #print " **** case view tour:" # check if the summary has been performed on this particular hour (last hour) if hdfsutil.test_file(os.path.join(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), '000000_0.deflate')): logger.info("** case view hour: checking day = %s, hour = %s, and file does not exist." % (datestamp, hourstamp)) f = open(os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp) query_result_file = os.path.join(config.mrqos_query_result,'case_view_hour.%s.%s.csv' % (datestamp, hourstamp)) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: tic = time.time() beeline.bln_e(strcmd_s) logger.info("BLN case view hour success @ cost = %s sec." 
% str(time.time()-tic)) try: beeline.bln_e_output(strcmd_g, query_result_file) except sp.CalledProcessError as e: logger.warning("copy to local failed, retrying...") print e.message try: beeline.bln_e_output(strcmd_g, query_result_file) except sp.CalledProcessError as e: logger.error("copy to local failed again, abort.") logger.exception("message") break except sp.CalledProcessError as e: # delete the folder if summarization failed. logger.info("BLN case view hour failed @ cost = %s sec in retrial #%s" % (str(time.time()-tic), str(count_retrial))) logger.exception("message") hdfsutil.rm(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: logger.info("** case view hour: checking day = %s, hour = %s, and file exists." % (datestamp, hourstamp)) # ############################## # # The REGION VIEW hive procedure # # ############################## # # check if the summary has been performed on this particular hour (last hour) if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')): logger.info("** region view hour: checking day = %s, hour = %s, and file does not exist." % (datestamp, hourstamp)) f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp) query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp)) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: tic = time.time() beeline.bln_e(strcmd_s) logger.info("BLN region view hour success @ cost = %s sec." % str(time.time()-tic)) try: beeline.bln_e_output(strcmd_g, query_result_file) except sp.CalledProcessError as e: logger.warning("copy to local failed, retrying...") print e.message try: beeline.bln_e_output(strcmd_g, query_result_file) except sp.CalledProcessError as e: logger.error("copy to local failed again, abort.") logger.exception("message") break except sp.CalledProcessError as e: # delete the folder if summarization failed. logger.info("BLN region view hour failed @ cost = %s sec in retrial #%s" % (str(time.time()-tic), str(count_retrial))) logger.exception("message") hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1 else: logger.info("** region view hour: checking day = %s, hour = %s, and file exists." % (datestamp, hourstamp))
def main(): """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """ ts = calendar.timegm(time.gmtime()) print "###################" print "# Performing the hourly mrqos_region summary" print "# starting processing time is " + str(ts) print "###################" ts_last_hour = ts - 3600 datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour))) hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour))) hour_list = [str("%02d" % x) for x in range(24)] hour_list = [x for x in hour_list if x <= hourstamp] region_summary_retrial_max = 10 # check if the summary has been performed on this particular hour (last hour) folders_day = '/'.join( str(config.hdfs_qos_rg_view_hour % (datestamp, '00')).split('/')[0:-1]) # check if the summary folder for "this day" (datestamp) has been created or not, if not, create one if hdfsutil.test_dic(folders_day): hdfsutil.mkdir(folders_day) folders_in = [folders_day + '/hour=%s' % x for x in hour_list] folders_out = hdfsutil.ls(folders_day) folders_missing = [x for x in folders_in if x not in folders_out] folders_missing.sort(reverse=True) for item in folders_missing: hourstamp = item[-2:] print " **** missing data for day = %s, hour = %s." % (datestamp, hourstamp), f = open( os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r') strcmd = f.read() strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp) f.close() strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % ( datestamp, hourstamp) query_result_file = os.path.join( config.mrqos_query_result, 'region_view_hour.%s.%s.csv' % (datestamp, hourstamp)) print " **** perform beeline for hourly summary for day = %s, hour = %s." % ( datestamp, hourstamp) count_retrial = 0 while count_retrial < region_summary_retrial_max: try: beeline.bln_e(strcmd_s) try: beeline.bln_e_output(strcmd_g, query_result_file) except: print " **** copy to local failed!" break except sp.CalledProcessError as e: # delete the folder if summarization failed. print " **** summarization failed upto #retrials=" + str( count_retrial) print " **** ", print e.message hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True) count_retrial += 1