import calendar
import datetime
import logging
import os
import subprocess as sp
import time

# project-local helpers: job settings, HDFS shell wrapper, beeline wrapper
import config
import hdfsutil
import beeline


def mrqos_table_cleanup():
    """ when called, this function deletes every partition of the MRQOS
    tables (score, distance, in_country, in_continent, ra_load) that is
    older than the configured threshold """
    # check each partition against the threshold; if it is too old, drop it
    # from the hive table and remove the backing folder from HDFS
    timenow = int(time.time())
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']
    for item in mtype:
        # list the partition folders for this table from HDFS,
        # e.g. ['.../ts=1451606400', ...], via config.hdfs_table_<item>
        this_partitions = hdfsutil.ls(getattr(config, 'hdfs_table_%s' % item))
        str_parts_list = [i.split('=', 1)[1] for i in this_partitions]
        str_parts_list_int = map(int, str_parts_list)
        print " ## for table: %s" % item
        print " ## ",
        print str_parts_list_int
        for partition in str_parts_list_int:
            if partition < timenow - config.mrqos_table_delete:
                try:
                    print " ## handling table: %s with ts=%s" % (item, str(partition))
                    # drop partitions (ok even if partition does not exist)
                    hiveql_str = 'use mrqos; alter table ' + item \
                                 + ' drop if exists partition(ts=%s)' % str(partition)
                    beeline.bln_e(hiveql_str)
                    # remove data from HDFS (ok even if folder in hdfs does not exist)
                    hdfs_d = os.path.join(config.hdfs_table, item, 'ts=%s' % partition)
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed in hive table clean up in table: %s." % item
                    print e.message
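
# A minimal sketch (illustration only; not called by the cleanup above) of the
# partition bookkeeping mrqos_table_cleanup() performs: parse 'ts=<epoch>'
# folder names from an HDFS listing and keep the ones older than the
# threshold. The folder names and numbers below are made-up example values.
def _stale_partitions_example(partition_folders, timenow, threshold):
    """
    >>> _stale_partitions_example(['/mrqos/score/ts=100', '/mrqos/score/ts=900'], 1000, 500)
    [100]
    """
    parts = [int(i.split('=', 1)[1]) for i in partition_folders]
    return [p for p in parts if p < timenow - threshold]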
def mrqos_join_cleanup():
    """ when called, this function deletes every partition of the
    mrqos_join table that is older than the configured threshold """
    timenow = int(time.time())
    # get the list of retired partitions in HDFS using the hive partition
    # listing; note the comparison is on the string form of the timestamps
    try:
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join').split('\n')
                            if '=' in x and x.split('=')[1] < str(timenow - config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join', 'ts<%s' % str(timenow - config.mrqos_join_delete))
            print " drop partitions successful. "
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed to remove HDFS folder for mrqos_join at partition folder %s" % str(partition_id)
            print " remove HDFS successful. "
        except sp.CalledProcessError as e:
            print ">> failed to drop partitions"
    except sp.CalledProcessError as e:
        print ">> failed to obtain retired partition list (HIVE)"
        print e.message
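
# The retired-partition filter above compares epoch timestamps as strings
# ("x.split('=')[1] < str(...)"). A minimal sketch of why that is safe only
# while both sides have the same number of digits (epoch seconds are 10
# digits from 2001 through 2286):
def _string_ts_compare_example():
    """
    >>> '1451606400' < '1451610000'   # same width: behaves like int compare
    True
    >>> '999999999' < '1451610000'    # width mismatch: lexicographic, wrong
    False
    """
    pass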
def mrqos_join_cleanupv2(logger):
    """ when called, this function deletes every partition of the
    mrqos_join2 table that is older than the configured threshold """
    timenow = int(time.time())
    # get the list of retired partitions in HDFS using the hive partition
    # listing (mrqos_join2, matching the table the partitions are dropped
    # from below)
    try:
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join2').split('\n')
                            if '=' in x and x.split('=')[1] < str(timenow - config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join2', 'ts<%s' % str(timenow - config.mrqos_join_delete))
            logger.info('drop hive partitions successful. ')
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join2', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    logger.info('failed to remove HDFS folder for mrqos_join2 at partition folder %s' % str(partition_id))
            logger.info('remove HDFS successful. ')
        except sp.CalledProcessError as e:
            logger.error('failed to drop partitions. ')
    except sp.CalledProcessError as e:
        logger.error('failed to obtain retired partition list (HIVE)')
        logger.error('error message: %s' % e.message)
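
# A minimal usage sketch for the v2 cleanup, assuming it is driven by a cron
# wrapper; the log file path here is an assumption modeled on the other
# scripts in this repo, not a path the source specifies for this job.
def _run_join_cleanupv2_example():
    logging.basicConfig(filename='/home/testgrp/logs/mrqos_cleanup.log',
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger('mrqos_cleanup')
    mrqos_join_cleanupv2(logger)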
def main():
    """ get the date and hour for the previous hour, check every hour from
    the beginning of that day, and insert the summaries that are missing """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts - 3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    # all hours of the day up to and including the previous hour
    hour_list = [str("%02d" % x) for x in range(24)]
    hour_list = [x for x in hour_list if x <= hourstamp]
    region_summary_retrial_max = 10

    # check if the summary has been performed on this particular hour (last hour)
    folders_day = '/'.join(str(config.hdfs_qos_rg_view_hour % (datestamp, '00')).split('/')[0:-1])
    # create the summary folder for "this day" (datestamp) if it does not
    # exist yet (hdfsutil.test_dic is truthy when the directory is missing)
    if hdfsutil.test_dic(folders_day):
        hdfsutil.mkdir(folders_day)
    folders_in = [folders_day + '/hour=%s' % x for x in hour_list]
    folders_out = hdfsutil.ls(folders_day)
    folders_missing = [x for x in folders_in if x not in folders_out]
    folders_missing.sort(reverse=True)

    for item in folders_missing:
        hourstamp = item[-2:]
        print " **** missing data for day = %s, hour = %s." % (datestamp, hourstamp),
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result,
                                         'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " **** perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print " **** copy to local failed!"
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed, then retry
                print " **** summarization failed up to #retrials=" + str(count_retrial)
                print " **** ",
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
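
# A minimal sketch (illustration only, made-up folder names) of the
# missing-hour detection used above: build the expected 'hour=HH' folders up
# to the previous hour and diff them against what HDFS actually has.
def _missing_hours_example(hourstamp, existing_folders):
    """
    >>> _missing_hours_example('02', ['/qos/day/hour=00', '/qos/day/hour=02'])
    ['/qos/day/hour=01']
    """
    hour_list = [str("%02d" % x) for x in range(24)]
    hour_list = [x for x in hour_list if x <= hourstamp]
    folders_in = ['/qos/day' + '/hour=%s' % x for x in hour_list]
    return [x for x in folders_in if x not in existing_folders]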
def main():
    # initialize the logger
    logging.basicConfig(filename=os.path.join('/home/testgrp/logs/', 'mapmon_summarize.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    timenow = int(time.time())
    # yesterday's date in YYYYMMDD form
    datenow = str(datetime.date.today() - datetime.timedelta(1))
    date_idx = datenow[0:4] + datenow[5:7] + datenow[8:10]

    # get the latest barebone day_idx
    bb_day_idx = beeline.get_last_partitions('mapper.barebones').split('=')[1]
    logger.info("barebone index: day={}".format(bb_day_idx))

    # get the mpd uuids that arrived yesterday
    uuid_list = [x.split('=')[-1]
                 for x in hdfsutil.ls(os.path.join(os.path.dirname(config.hdfs_table),
                                                   'mapper', 'mapmon',
                                                   'day={}'.format(date_idx)))]

    for uuid_idx in uuid_list:
        logger.info("dealing with day={}, uuid={}".format(date_idx, uuid_idx))
        file_location = os.path.join(config.hdfs_table, 'mapmon_sum',
                                     'day={}'.format(date_idx),
                                     'mpd_uuid={}'.format(uuid_idx))
        # create the destination folder if it is missing
        # (hdfsutil.test_dic is truthy when the directory is missing)
        if hdfsutil.test_dic(file_location):
            logger.info('creating folder: {}'.format(file_location))
            hdfsutil.mkdir(file_location)
        # only summarize when the output file does not exist yet
        if hdfsutil.test_file(os.path.join(file_location, '000000_0.deflate')):
            f = open(os.path.join(config.mrqos_hive_query, 'mapmon_summarize.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (date_idx, uuid_idx, bb_day_idx,
                                 date_idx, uuid_idx, date_idx, uuid_idx)
            f.close()
            try:
                beeline.bln_e(strcmd_s)
            except:
                # delete the folder if summarization failed.
                logger.warn("summarization failed, removing hdfs folder.")
                hdfsutil.rm(file_location, r=True)
        else:
            logger.info(" file exists.")
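
# A minimal sketch of the date_idx derivation above: str() of a date is the
# ISO form 'YYYY-MM-DD', and the slicing strips the dashes to get 'YYYYMMDD'.
def _date_idx_example(day):
    """
    >>> _date_idx_example(datetime.date(2016, 1, 31))
    '20160131'
    """
    s = str(day)                      # 'YYYY-MM-DD'
    return s[0:4] + s[5:7] + s[8:10]  # 'YYYYMMDD'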