def mrqos_join_cleanup():
    """ when called, this function deletes all partitions of the mrqos_join table
        that are older than the retention threshold """
    # get the lowest partition by checking the HDFS folders
    joined_partitions = hdfsutil.ls(config.hdfs_table_join)
    str_parts_list = [i.split('=', 1)[1] for i in joined_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold
    timenow = int(time.time())

    # get the list of retired data in HDFS using hive partitions
    try:
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join').split('\n')
                            if '=' in x and x.split('=')[1] < str(timenow - config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join', 'ts<%s' % str(timenow - config.mrqos_join_delete))
            print " drop partitions successful. "
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed to remove HDFS folder for mrqos_join at partition folder %s" % str(partition_id)
            print " remove HDFS successful. "
        except sp.CalledProcessError as e:
            print ">> failed to drop partitions"
    except sp.CalledProcessError as e:
        print ">> failed to obtain retire partition list (HIVE)"
        print e.message
def mrqos_join_cleanupv2(logger):
    """ when called, this function deletes all partitions of the mrqos_join2 table
        that are older than the retention threshold """
    # get the lowest partition by checking the HDFS folders
    joined_partitions = hdfsutil.ls(config.hdfs_table_join2)
    str_parts_list = [i.split('=', 1)[1] for i in joined_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold
    timenow = int(time.time())

    # get the list of retired data in HDFS using hive partitions
    try:
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join2').split('\n')
                            if '=' in x and x.split('=')[1] < str(timenow - config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join2', 'ts<%s' % str(timenow - config.mrqos_join_delete))
            logger.info('drop hive partitions successful. ')
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join2', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    logger.info('failed to remove HDFS folder for mrqos_join2 at partition folder %s' % str(partition_id))
            logger.info('remove HDFS successful. ')
        except sp.CalledProcessError as e:
            logger.error('failed to drop partitions. ')
    except sp.CalledProcessError as e:
        logger.error('failed to obtain retire partition list (HIVE)')
        logger.error('error message: %s' % e.message)
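# beeline.drop_partitions() is an in-house wrapper that is not defined in this file. The
# sketch below is only an assumption of what such a wrapper could look like: it shells out
# to the standard beeline CLI and issues a HiveQL ALTER TABLE ... DROP PARTITION statement
# built from the same (tablename, condition) arguments used by the cleanup functions above.
# The JDBC URL is a placeholder.
def drop_partitions_sketch(tablename, condition, jdbc_url='jdbc:hive2://localhost:10000'):
    """ hypothetical sketch: drop all partitions of `tablename` matching `condition`,
        e.g. condition='ts<1450000000'. """
    hql = 'ALTER TABLE %s DROP IF EXISTS PARTITION (%s);' % (tablename, condition)
    # raises sp.CalledProcessError on a non-zero exit code, matching the callers above
    sp.check_call(['beeline', '-u', jdbc_url, '-e', hql])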
def main(): """ this function will do the query on 5 different measurement and upload the data to hdfs accordingly, this also join tables at single time point """ # different queries (various types) # logging set-up logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'mrqos_sum_comparison.log'), level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S') logger = logging.getLogger(__name__) # ############################## # start the script # parameter setting # ############################## n_retrial = config.query_retrial day_in_seconds = 86400 list_of_partitions = [x.split('=')[-1] for x in beeline.show_partitions('mrqos.mrqos_sum').split('\n') if '=' in x] ts_now = list_of_partitions[-1] ts_ex_14d = time.strftime('%Y%m%d', time.gmtime(time.mktime(time.strptime(ts_now, '%Y%m%d')) - 14 * day_in_seconds)) ts_14d = [x for x in list_of_partitions if x <= ts_ex_14d][-1] ts_ex_28d = time.strftime('%Y%m%d', time.gmtime(time.mktime(time.strptime(ts_now, '%Y%m%d')) - 28 * day_in_seconds)) ts_28d = [x for x in list_of_partitions if x <= ts_ex_28d][-1] ts_ex_3d = time.strftime('%Y%m%d', time.gmtime(time.mktime(time.strptime(ts_now, '%Y%m%d')) - 3 * day_in_seconds)) ts_3d = [x for x in list_of_partitions if x <= ts_ex_3d][-1] #content = '''beeline.bln_e_output(qry0 % (ts_now, ts_14d), os.path.join(config.mrqos_data, 'processed_2wjoin_full.tmp')) ''' my_retrial(id='2W summary (no load)', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_14d) #content = '''beeline.bln_e_output(qry % (ts_now, ts_14d), os.path.join(config.mrqos_data, 'processed_2wjoin_full_wloads.tmp')) ''' my_retrial(id='2W summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_14d) #content = '''beeline.bln_e_output(qry % (ts_now, ts_28d), os.path.join(config.mrqos_data, 'processed_4wjoin_full_wloads.tmp')) ''' my_retrial(id='4W summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_28d) #content = '''beeline.bln_e_output(qry % (ts_now, ts_3d), os.path.join(config.mrqos_data, 'processed_3djoin_full_wloads.tmp')) ''' my_retrial(id='3D summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_3d) # new summary (with in-out-ratio) my_retrial(id='3Dn summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_3d) my_retrial(id='2Wn summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_14d) my_retrial(id='4Wn summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_28d)
def main():
    # set up the logger
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'mpg_cluster.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # NSJOIN dayidx # only partitioned by DAY
    day_idx = beeline.get_last_partitions('mapper.nsjoin').split('=')[1]
    # BAREBONES dayidx # only partitioned by DAY
    day_bb = [x for x in beeline.show_partitions('mapper.barebones').split('\n') if '=%s' % (day_idx) in x]
    # MAPPOINTS dayidx # partitioned by DAY and UUID (pick the last uuid)
    mappoints_data = sorted([x for x in beeline.show_partitions('mapper.mappoints').split('\n')
                             if '=%s' % (day_idx) in x])[-1].split('/')
    [day_mps, uuid_idx] = [x.split('=')[1] for x in mappoints_data]

    if day_idx != day_mps:
        logger.error('mapper.mappoints and mapper.nsjoin have different days, possible data missing in the source.')
        return

    if len(day_bb) == 0:
        logger.warning('mapper.barebone data missing for this particular day.')
        #return

    logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))
    logger.info('begin spark process.')
    getting_mappoint_data = ''' select b1.mpgid mpgid, b1.lat lat, b1.lon lon, b1.country country, b1.mpgload mpgload, b1.allowed_private_regions allowed_private_regions, b2.asnum asnum, b2.ip ip from (select mpgid, lat, lon, country, mpgload, allowed_private_regions from mapper.mappoints where day=%s and uuid="%s" and lat is not NULL and lon is not NULL and ghostonly=0 ) b1 left outer join (select collect_set(ns_ip) ip, collect_set(asnum) asnum, mpgid from (select ns_ip, mpd_uuid, mpgid, asnum, demand, day from mapper.nsjoin where day=%s and mpd_uuid="%s" and demand>0.01 order by demand desc) a group by mpgid) b2 on b2.mpgid=b1.mpgid ''' % (day_idx, uuid_idx, day_idx, uuid_idx)

    geo_total_cap_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='private' group by country, network) a ''' % day_idx

    geo_total_cap_public_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='public' group by country, network) a ''' % day_idx

    sc = SparkContext()
    hiveCtx = HiveContext(sc)

    rows = hiveCtx.sql(getting_mappoint_data)
    regInfoRows = hiveCtx.sql('select * from mapper.regioncapday where day=%s and peak_bitcap_mbps is not null and peak_flitcap_mfps is not null' % (day_idx))
    geo_total_cap = hiveCtx.sql(geo_total_cap_query)
    geo_total_cap_p = hiveCtx.sql(geo_total_cap_public_query)

    # rdd format: [regionid, [mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load, mpg-asnum, mpg-nsip]]
    region_mpginfo_pair = rows.map(lambda x: [[x.mpgid, x.lat, x.lon, x.country, x.mpgload, x.asnum, x.ip],
                                              x.allowed_private_regions])\
        .flatMapValues(lambda x: x).map(lambda x: [x[1], x[0]])

    #region_mpginfo_pair.first()

    # rdd format: [regionid, [reg-lat, reg-lon, reg-capacity(bit mbps), reg-capacity(bit mfps), reg-country, reg-numvips, reg-service, reg-prp]]
    # ps. prp=1: private, prp=0: public
    region_latlon = regInfoRows.map(lambda x: [x.region, [x.latitude,
                                                          x.longitude,
                                                          x.peak_bitcap_mbps,
                                                          x.peak_flitcap_mfps,
                                                          x.country,
                                                          x.numvips,
                                                          'W' if x.network=='freeflow' else ('S' if x.network=='essl' else 'O'),
                                                          1 if x.prp=='private' else 0]])\
        .filter(lambda x: x[1][6]=='W' or x[1][6]=='S')

    region_public_list = region_latlon\
        .filter(lambda x: x[1][7] == 0)\
        .map(lambda x: ('all', [[x[0]]]))\
        .reduceByKey(lambda a, b: [a[0]+b[0]])\
        .map(lambda x: x[1][0]).collect()

    region_public_list = [0] + sorted(region_public_list[0])

    # dummy region
    rdd2 = sc.parallelize([([0, [0, 0, 0.0, 0.0, 'US', 0, 'W', 1]])])
    region_latlon = region_latlon.union(rdd2)

    # perform the join into tuple of (K, (V1, V2):
    # (regionid, ([mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load], [reg-lat, reg-lon, reg-cap, reg-country, reg-numvips, reg-service]))
    # rdd = (mpgid, regionid, [lat1, lon1, lat2, lon2, distance],
    #        reg-cap-bit(gbps), reg-cap-flit(gbps), reg-country, reg-numvips, reg-services,
    #        mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #        mpg-lat, mpg-lon)
    mpgid_reg_geo = region_mpginfo_pair.join(region_latlon).map(lambda x: [x[1][0][0],
                                                                           x[0],
                                                                           geodesic_distance(x[1][0][1],
                                                                                             x[1][0][2],
                                                                                             x[1][1][0],
                                                                                             x[1][1][1]),
                                                                           round(float(x[1][1][2])/1000.0, 3),
                                                                           round(float(x[1][1][3])/1000.0, 3),
                                                                           x[1][1][4], # reg-country
                                                                           x[1][1][5], # reg-numvips
                                                                           x[1][1][6], # reg-services
                                                                           x[1][0][3],
                                                                           x[1][0][4],
                                                                           x[1][0][5],
                                                                           x[1][0][6],
                                                                           x[1][0][1],
                                                                           x[1][0][2]])

    # filtering on mapping distance < 500 miles
    # filtering on reg-country = mpg-country
    # filtering on region capacity fbps > 1Gbps
    # rdd format = (mpgid, [[regionid], distance, [capacity-w, capacity-s], numvips, 1, mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #               mpg-lat, mpg-lon])
    #mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: x[2][4] < 500)\
    #    .filter(lambda x: x[5] == x[8])\
    #    .filter(lambda x: x[3] > 1)\
    #    .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]]))

    # or this one, no-same-country constraint:
    mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: (x[2][4] < 500) or (x[5]==x[8] and x[2][4] < 1000))\
        .filter(lambda x: x[3] > 1)\
        .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]]))

    #mpgid_reg_distance.first()

    # group by mpgid
    # rdd format = (mpgid, [[reg-list],
    #                       avg_distance,
    #                       total_cap freeflow,
    #                       total_cap essl,
    #                       total num vips,
    #                       rg_count,
    #                       mpg-country,
    #                       mpg-load,
    #                       [mpg-asnum],
    #                       [mpg-nsip])
    mpgid_reglist_avgDistance_capacity_nReg = mpgid_reg_distance\
        .reduceByKey(lambda a, b: [a[0]+b[0],
                                   a[1]+b[1],
                                   [a[2][0]+b[2][0], a[2][1]+b[2][1]],
                                   a[3]+b[3],
                                   a[4]+b[4],
                                   a[5], a[6], a[7], a[8], a[9], a[10]])\
        .map(lambda x: (x[0], [sorted(x[1][0]), # region_list
                               round(x[1][1]/x[1][4], 2), # avg distance
                               round(x[1][2][0], 2), # total capacity - w
                               round(x[1][2][1], 2), # total capacity - s
                               x[1][3], # numvips
                               x[1][4], # total region count
                               x[1][5], # mpg country
                               x[1][6], # mpg load
                               x[1][7], # mpg asnum
                               x[1][8], # mpg nsip
                               x[1][9], # mpg lat
                               x[1][10]])) # mpg lon

    # disable the count
    #total_mpg_with_region = mpgid_reglist_avgDistance_capacity_nReg.count()

    # rdd format = (reg, [(reg-list), [[mpg-list], avg_distance, total_cap_w, total_cap_s, total_numvips
    #                     reg-count, cluster_country, mpg-load, mpg-count, mpg-lat, mpg-lon]])
    reg_reglist_mpgid_avgDistance_capacity_nReg_country = mpgid_reglist_avgDistance_capacity_nReg\
        .map(lambda x: (tuple(x[1][0]), [[x[0]], # mpgid list
                                         x[1][1], # avg_distance
                                         x[1][2], # region total capacity freeflow
                                         x[1][3], # region total capacity essl
                                         x[1][4], # total num vips
                                         x[1][5], # total region count
                                         [x[1][6]], # mpg country list
                                         x[1][7], # mpg load
                                         1, # mpg-count
                                         x[1][8] if x[1][8] else [], # [mpg-asnum]
                                         x[1][9] if x[1][9] else [], # [mpg-nsip]
                                         [x[1][10]], # [mpg-lat] # single element array
                                         [x[1][11]], # [mpg-lon] # single element array
                                         [x[1][7]] # [mpg-load] # single element array
                                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0],
                                   a[1], a[2], a[3], a[4], a[5],
                                   a[6]+b[6],
                                   a[7]+b[7],
                                   a[8]+b[8],
                                   a[9]+b[9],
                                   a[10]+b[10],
                                   a[11]+b[11],
                                   a[12]+b[12],
                                   a[13]+b[13]])\
        .filter(lambda x: sum(x[1][13]) > 0.0001)\
        .map(lambda x: (x[0], [sorted(x[1][0]), # mpgid list
                               x[1][1], # avg_distance
                               x[1][2], # reg-cap-w
                               x[1][3], # reg-cap-s
                               x[1][4], # numvips
                               x[1][5], # reg-count
                               [str(y) for y in sorted(list(set(x[1][6])))], # mpg-country list
                               x[1][7], # mpg-load
                               x[1][8], # mpg-count
                               [str(y) for y in sorted(list(set(x[1][9])))], # [mpg-asnum]
                               [str(y) for y in sorted(list(set(x[1][10])))], # [mpg-nsip]
                               geo_centroid(x[1][11], x[1][12], x[1][13]) # [mpg: lat, lon, por, porsigma]
                               ]))\
        .map(lambda x: ([':'.join([str(y) for y in list(x[1][6])]), # [mpg-country list]
                         x[1][1], # avg_distance
                         x[1][2], # reg-cap-w
                         x[1][3], # reg-cap-s
                         x[1][4], # numvips
                         x[1][5], # reg-count
                         x[1][7], # mpg-load
                         x[1][8], # mpg-count
                         ':'.join([str(y) for y in x[0]]), # [region-list]
                         ':'.join([str(y) for y in list(x[1][0])]), # [mpg-list]
                         ':'.join([str(y) for y in x[1][9]]) if len(x[1][9])>0 else 'NULL', # [mpg-asnum]
                         ':'.join([str(y) for y in x[1][10]]) if len(x[1][10])>0 else 'NULL', # [mpg-nsip]
                         x[1][11] # [mpg-lat, mpg-lon, mpg-por, mpg-porsigma]
                         ], region_public_list))\
        .flatMapValues(lambda x: x)\
        .map(lambda x: [x[1], x[0]])

    reglist_mpgid_avgDistance_capacity_nReg_country = reg_reglist_mpgid_avgDistance_capacity_nReg_country\
        .join(region_latlon)\
        .map(lambda x: [x[1][0]] + [x[1][1]] + [geodesic_distance(x[1][0][12][0], x[1][0][12][1], x[1][1][0], x[1][1][1])] + [x[0]]
             if x[0] > 0
             else [x[1][0]] + [x[1][1]] + [[x[1][0][12][0], x[1][0][12][1], x[1][1][0], x[1][1][1], 0.0]] + [x[0]])\
        .filter(lambda x: x[2][4] < 500)\
        .map(lambda x: (tuple([x[0][0], x[0][1], x[0][2], x[0][3], x[0][4], x[0][5], x[0][6], x[0][7],
                               x[0][8], x[0][9], x[0][10], x[0][11],
                               x[0][12][0], x[0][12][1], x[0][12][2], x[0][12][3]]), # mpg-information
                        [x[1][2], # pub.region.cap.ff
                         x[1][3], # pub.region.cap.essl
                         x[1][5], # pub.region.vip
                         [x[3]] # single element region id
                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0], # sum( pub.region.cap.ff )
                                   a[1]+b[1], # sum( pub.region.cap.essl )
                                   a[2]+b[2], # sum( pub.region.cap.vip )
                                   a[3]+b[3] # [pub.regions]
                                   ])\
        .map(lambda x: [x[0][0], # [mpg-country-list]
                        x[0][1], # avg-distance
                        x[0][12], # mpg-lat
                        x[0][13], # mpg-lon
                        x[0][14], # mpg-por
                        x[0][15], # mpg-porsigma
                        x[0][2], # pri.region.cap.ff (gbps)
                        x[0][3], # pri.region.cap.essl (gbps)
                        x[0][4], # pri.vips
                        x[0][5], # pri.region.count
                        round(float(x[1][0])/1000.0, 3), # pub.region.cap.ff (gbps)
                        round(float(x[1][1])/1000.0, 3), # pub.region.cap.essl (gbps)
                        x[1][2], # pub.vips
                        len(x[1][3])-1, # pub.region.count
                        x[0][6], # mpg-load
                        round(x[0][7], 6), # mpg-count
                        x[0][8], # [pri reg-list]
                        ':'.join([str(y) for y in sorted(x[1][3])][1:]) if len(x[1][3])>1 else 'NULL', # [pub reg-list]
                        x[0][9], # [mpg-list]
                        x[0][10], # [mpg-asnum]
                        x[0][11] # [mpg-nsip]
                        ])

    # data exporting to local
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist = pd.DataFrame(columns=['cl_geoname', 'cl_avgDistance',
                                                                                           'cl_lat', 'cl_lon', 'cl_por',
                                                                                           'cl_porsigma', 'pri_cap_ff_gbps',
                                                                                           'pri_cap_essl_gbps', 'pri_nvips',
                                                                                           'pri_nReg', 'pub_cap_ff_gbps',
                                                                                           'pub_cap_essl_gbps', 'pub_nvips',
                                                                                           'pub_nReg', 'cl_mpgLoad',
                                                                                           'cl_nMpg', 'pri_regList',
                                                                                           'pub_regList', 'mpgList',
                                                                                           'mpgASList', 'mpgNSIPList'])

    geo_cluster_full_info = reglist_mpgid_avgDistance_capacity_nReg_country.collect()
    logger.info('begin write to local disk.')
    for item in range(len(geo_cluster_full_info)):
        temp = geo_cluster_full_info[item]
        country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.loc[item] = temp # the above should be temp[1][0] for the mpglist

    data_folder = '/home/testgrp/MRQOS/project_mpd_clustering/data'
    filename = 'geo_full_cluster_info.%s.%s.csv' % (day_idx, uuid_idx)
    fileDestination = os.path.join(data_folder, filename)
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.to_csv(fileDestination,
                                                                          sep=',',
                                                                          index=False,
                                                                          header=False)

    logger.info('begin to upload to hdfs.')
    tablename = 'mrqos.mpg_cluster'
    hdfs_d = os.path.join(config.hdfs_table,
                          'mpg_cluster',
                          'datestamp=%s' % day_idx,
                          'uuid=%s' % uuid_idx)
    partition = '''datestamp=%s, uuid='%s' ''' % (day_idx, uuid_idx)
    processed_filename = '.'.join(filename.split('.')[0:-1]) + '.processed.csv'
    cmd_str = ''' cat %s | awk -F, '{n=split($21,a,":"); if(n>5){$21=a[1]":"a[2]":"a[3]":"a[4]":"a[5];} m=split($20,b,":"); if(m>5){$20=b[1]":"b[2]":"b[3]":"b[4]":"b[5];}print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$20,$21;}' > %s ''' % (os.path.join(data_folder, filename),
                                                                                                                                                                                                                                                       os.path.join(data_folder, processed_filename))
    sp.check_call(cmd_str, shell=True)

    try:
        beeline.upload_to_hive(fileDestination, hdfs_d, partition, tablename, logger)
        # os.remove(fileDestination)
    except sp.CalledProcessError as e:
        logger.info('upload to HDFS + update Hive table failed.')
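# geodesic_distance() and geo_centroid() are helpers defined elsewhere in the repo. The
# sketch below is an assumption inferred from how geodesic_distance() is used above: it
# returns [lat1, lon1, lat2, lon2, distance] with the distance in miles, so that index 4 is
# what the "< 500 mile" filters test. The haversine formula is an illustrative choice, not
# necessarily the original math. geo_centroid() is not sketched because its por/porsigma
# outputs are not specified in this file.
import math  # standard library; assumed not already imported at the top of this file

def geodesic_distance_sketch(lat1, lon1, lat2, lon2):
    """ hypothetical sketch: haversine distance in miles, returned together with the two
        coordinate pairs so that index 4 carries the distance. """
    earth_radius_miles = 3958.8
    p1, p2 = math.radians(float(lat1)), math.radians(float(lat2))
    dp = p2 - p1
    dl = math.radians(float(lon2) - float(lon1))
    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
    distance = 2 * earth_radius_miles * math.asin(math.sqrt(a))
    return [lat1, lon1, lat2, lon2, round(distance, 2)]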
def main():
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'hive_table_cleanup.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    # ##############################
    ts = int(time.time())
    ts_timeout = ts - config.mrqos_table_delete * 24 * 3 # 3 days = (24*3) hours of time-out
    date_timeout = time.strftime('%Y%m%d', time.gmtime(float(ts_timeout)))
    # hourstamp = time.strftime('%H', time.gmtime(float(ts)))

    list_to_clean = sorted(list(set([x.split('/')[0] for x in beeline.show_partitions('mrqos.mrqos_region').split('\n')])))
    list_to_clean = [x for x in list_to_clean if ('=' in x and x.split('=')[1] < date_timeout)]

    logger.info('handling table: mrqos_region')
    try:
        logger.info('removing the data in HDFS')
        # remove the hdfs folder
        for item in list_to_clean:
            hdfsutil.rm(os.path.join(config.hdfs_table, 'mrqos_region', '%s' % item), r=True)

        # alter the hive table: mrqos_region
        try:
            logger.info('drop partitions, condition: datestamp<%s' % str(date_timeout))
            beeline.drop_partitions(tablename='mrqos.mrqos_region', condition='datestamp<%s' % str(date_timeout))
        except sp.CalledProcessError as e:
            logger.error('drop partition failed')
            logger.error('error: %s' % e.message)
    except sp.CalledProcessError as e:
        logger.error('removing data from hdfs failed')
        logger.error('error: %s' % e.message)

    # ##############################
    # target table: maprule_info, mcm_machines
    # ##############################
    query_item = ['maprule_info', 'mcm_machines']
    for scan in query_item:
        logger.info('handling table: %s' % scan)
        list_to_clean = sorted(list(set([x.split('/')[0] for x in beeline.show_partitions('mrqos.%s' % scan).split('\n')])))
        list_to_clean = [x for x in list_to_clean if ('=' in x and int(x.split('=')[1]) < ts_timeout)]

        try:
            logger.info('removing the data in HDFS')
            # remove the hdfs folder
            for item in list_to_clean:
                hdfsutil.rm(os.path.join(config.hdfs_table, '%s' % scan, '%s' % item), r=True)

            # alter the hive table
            try:
                logger.info('drop partitions, condition: ts<%s' % str(ts_timeout))
                beeline.drop_partitions(tablename='mrqos.%s' % scan, condition='ts<%s' % str(ts_timeout))
            except sp.CalledProcessError as e:
                logger.error('drop partition failed')
                logger.error('error: %s' % e.message)
        except sp.CalledProcessError as e:
            logger.error('removing data from hdfs failed')
            logger.error('error: %s' % e.message)
def main():
    # set up the logger
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'ra_summary.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # table nsjoin (day, uuid)
    # table mapmon (day, uuid)
    datenow = str(datetime.date.today() - datetime.timedelta(1))
    day_idx = datenow[0:4] + datenow[5:7] + datenow[8:10]
    uuid_list = [x.split('=')[-1] for x in beeline.show_partitions('mrqos.mapmon_sum').split('\n') if day_idx in x]

    sc = SparkContext()
    hiveCtx = HiveContext(sc)
    post_partition_n = 1000

    for uuid_idx in uuid_list:
        # ns_ip, demand, asnum ns_asnum, ns_country, ns_continent, ns_lat, ns_lon, ns_mpgid, mpgload
        nsjoin_query = """ select ns_ip, demand, asnum ns_asnum, country_code ns_country, continent ns_continent, round(latitude,3) ns_lat, round(longitude,3) ns_lon, mpgid ns_mpgid, mpgload from mapper.nsjoin where day={} and mpd_uuid='{}' and longitude is not NULL and latitude is not NULL and demand > 1""".format(day_idx, uuid_idx)
        # mpgid, mrid, mpg_type, region, link, min_s, max_s, min_r, max_r, ping, local, cont_fb, mpd_dftime, ecor, continent, country, latitude, longitude, prp
        mapmon_query = """ select mpgid, mrid, mpg_type, region, link, min_s, max_s, min_r, max_r, ping, local, cont_fb, mpd_dftime, ecor, continent, country, latitude, longitude, prp from mrqos.mapmon_sum where day={} and mpd_uuid='{}' and longitude is not NULL and latitude is not NULL""".format(day_idx, uuid_idx)

        logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))
        nsjoin = hiveCtx.sql(nsjoin_query)
        nsjoin_rows = nsjoin.repartition(post_partition_n).cache()
        data = hiveCtx.sql(mapmon_query)
        data_rows = data.repartition(post_partition_n).cache()

        col = ['mpgid', 'mrid', 'mpg_type', 'region', 'link', 'min_s', 'max_s', 'min_r', 'max_r', 'ping', 'local',
               'cont_fb', 'mpd_dftime', 'ecor', 'continent', 'country', 'latitude', 'longitude', 'prp', 'ns_ip',
               'demand', 'ns_asnum', 'ns_country', 'ns_continent', 'ns_lat', 'ns_lon', 'mpgload']

        cols_appended = ['nsip', 'mrid', 'ns_demand', 'ns_asnum', 'ns_country', 'ns_continent', 'ns_lat', 'ns_lon',
                         'mpgid', 'mpg_type', 'mpg_load', 'regions', 'region_links', 'dftime_ratio', 'ecors',
                         'list_min_s', 'list_max_s', 'list_min_r', 'list_max_r', 'region_lats', 'region_lons',
                         'min_s', 'max_s', 'min_r', 'max_r', 'ping_ratio', 'local_ratio', 'cont_fb_ratio',
                         'in_cont_ratio', 'in_country_ratio', 'private_ratio', 'avg_distance', 'num_region_mapped',
                         'mapping_entropy', 'sum_dftime']

        df = nsjoin_rows.join(data_rows, data_rows.mpgid == nsjoin_rows.ns_mpgid, 'inner')[col].cache()
        row1 = data_rows.agg(F.max(data_rows.mpd_dftime)).collect()[0]
        max_dftime = row1[0]

        df2 = df.map(lambda x: x + Row(geodesic_distance_weighted(x.ns_lat, x.ns_lon, x.latitude, x.longitude, x.mpd_dftime)))\
            .map(lambda x: ((x[19], # nsip
                             x[20], # demand
                             x[21], # ns_asnum
                             x[22], # ns_country
                             x[23], # ns_continent
                             round(x[24], 3), # ns_lat & ns_lon
                             round(x[25], 3),
                             x[0], # mpgid
                             x[1], # mrid
                             x[2], # mpg type
                             x[26], # mpg load
                             ),
                            [[int(x[3])], # region
                             [str(int(x[3])) + "_" + str(int(x[4]))], # region_link
                             x[5]/max_dftime, # min_s
                             x[6]/max_dftime, # max_s
                             x[7]/max_dftime, # min_r
                             x[8]/max_dftime, # max_r
                             x[9]/max_dftime, # ping ratio
                             x[10]/max_dftime, # local ratio
                             x[11]/max_dftime, # cont_fb ratio
                             [round(x[12]/max_dftime, 3)], # mpd_dftime/max_dftime (time ratio)
                             [int(x[13])], # ecor
                             x[12]/max_dftime * [0, 1][x[14] == x[23]], # mapping in-continent ratio
                             x[12]/max_dftime * [0, 1][x[15] == x[22]], # mapping in-country ratio
                             [round(x[16], 3)], # lat
                             [round(x[17], 3)], # lon
                             x[18]/max_dftime, # prp
                             x[27]/max_dftime, # w_distance
                             x[12],
                             [round(x[5]/x[12], 2)], # min_s list
                             [round(x[6]/x[12], 2)], # max_s list
                             [round(x[7]/x[12], 2)], # min_r list
                             [round(x[8]/x[12], 2)], # max_r list
                             ]))\
            .reduceByKey(lambda a, b: [x+y for x, y in zip(a, b)])\
            .map(lambda x: [x[0][0], # nsip
                            x[0][8], # mrid
                            x[0][1], # demand
                            x[0][2], # ns_asnum
                            x[0][3], # ns_country
                            x[0][4], # ns_continent
                            x[0][5], # ns_lat
                            x[0][6], # ns_lon
                            x[0][7], # mpgid
                            x[0][9], # mpg type
                            x[0][10], # mpg load
                            x[1][0], # list of region
                            x[1][1], # list of region_link
                            [round(100 * float(y), 2) for y in x[1][9]], # list of covered_record ratio
                            x[1][10], # list of ecor
                            x[1][13], # list of region lat
                            x[1][14], # list of region lon
                            round(x[1][2] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # min_s
                            round(x[1][3] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # max_s
                            round(x[1][4] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # min_r
                            round(x[1][5] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # max_r
                            round(100 * x[1][6] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # ping ratio
                            round(100 * x[1][7] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # local ratio
                            round(100 * x[1][8] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # cont_fb ratio
                            round(100 * x[1][11] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # mapping in-continent ratio
                            round(100 * x[1][12] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # mapping in-country ratio
                            round(100 * x[1][15] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # private ratio
                            round(x[1][16] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # w_distance
                            round(x[1][17], 3), # summation of covered dftime
                            x[1][18], # list of min_s
                            x[1][19], # list of max_s
                            x[1][20], # list of min_r
                            x[1][21], # list of max_r
                            len(x[1][9]), # number of different regions mapped
                            round(computeEntropyPMF(x[1][9]), 6), # entropy of the region assignments
                            ])\
            .map(lambda x: x + [[i[0] for i in sorted(enumerate([float(y) for y in x[13]]), key=lambda z: z[1], reverse=True)]])\
            .map(lambda x: x[:11] + [':'.join([str(x[11][i]) for i in x[35]]), # list of region
                                     ':'.join([str(x[12][i]) for i in x[35]]), # list of region_link
                                     ':'.join([str(x[13][i]) for i in x[35]]), # list of covered_record ratio
                                     ':'.join([str(x[14][i]) for i in x[35]]), # list of ecor
                                     ':'.join([str(x[29][i]) for i in x[35]]), # list of min_s
                                     ':'.join([str(x[30][i]) for i in x[35]]), # list of max_s
                                     ':'.join([str(x[31][i]) for i in x[35]]), # list of min_r
                                     ':'.join([str(x[32][i]) for i in x[35]]), # list of max_r
                                     ':'.join([str(x[15][i]) for i in x[35]]), # list of region lat
                                     ':'.join([str(x[16][i]) for i in x[35]]), # list of region lon
                                     ] + x[17:28] + x[33:35] + [x[28]])\
            .toDF(cols_appended).cache()

        df_all = df2.map(lambda x: toCSVLine(x))
        logger.info('writing into HDFS')
        df_all.saveAsTextFile('/ghostcache/hadoop/data/MRQOS/mrqos_mapmon_stats/datestamp={}/uuid={}'.format(day_idx, uuid_idx))
        logger.info('updating Hive table: mrqos_mapmon_stats')
        beeline.add_partitions("mrqos.mrqos_mapmon_stats", "datestamp='{}',uuid='{}'".format(day_idx, uuid_idx))
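# computeEntropyPMF() and toCSVLine() are helpers defined elsewhere in the repo. The
# sketches below are assumptions based on their use above: computeEntropyPMF() receives the
# list of per-region dftime ratios and should return the Shannon entropy of that
# (re-normalized) distribution, and toCSVLine() flattens a Row into one comma-separated line
# for saveAsTextFile().
import math  # standard library; assumed not already imported at the top of this file

def computeEntropyPMF_sketch(weights):
    """ hypothetical sketch: Shannon entropy (in nats) of a list of non-negative weights,
        normalized into a probability mass function first. """
    total = float(sum(weights))
    if total <= 0:
        return 0.0
    pmf = [float(w) / total for w in weights if w > 0]
    return -sum(p * math.log(p) for p in pmf)

def toCSVLine_sketch(row):
    """ hypothetical sketch: join every field of a Row into a single CSV line. """
    return ','.join(str(field) for field in row)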
def main(): """ this function will do the query on 5 different measurement and upload the data to hdfs accordingly, this also join tables at single time point """ # different queries (various types) # logging set-up logging.basicConfig( filename=os.path.join(config.mrqos_logging, 'mrqos_sum_comparison.log'), level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S') logger = logging.getLogger(__name__) # ############################## # start the script # parameter setting # ############################## n_retrial = config.query_retrial day_in_seconds = 86400 list_of_partitions = [ x.split('=')[-1] for x in beeline.show_partitions('mrqos.mrqos_sum').split('\n') if '=' in x ] ts_now = list_of_partitions[-1] ts_ex_14d = time.strftime( '%Y%m%d', time.gmtime( time.mktime(time.strptime(ts_now, '%Y%m%d')) - 14 * day_in_seconds)) ts_14d = [x for x in list_of_partitions if x <= ts_ex_14d][-1] ts_ex_28d = time.strftime( '%Y%m%d', time.gmtime( time.mktime(time.strptime(ts_now, '%Y%m%d')) - 28 * day_in_seconds)) ts_28d = [x for x in list_of_partitions if x <= ts_ex_28d][-1] ts_ex_3d = time.strftime( '%Y%m%d', time.gmtime( time.mktime(time.strptime(ts_now, '%Y%m%d')) - 3 * day_in_seconds)) ts_3d = [x for x in list_of_partitions if x <= ts_ex_3d][-1] #content = '''beeline.bln_e_output(qry0 % (ts_now, ts_14d), os.path.join(config.mrqos_data, 'processed_2wjoin_full.tmp')) ''' my_retrial(id='2W summary (no load)', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_14d) #content = '''beeline.bln_e_output(qry % (ts_now, ts_14d), os.path.join(config.mrqos_data, 'processed_2wjoin_full_wloads.tmp')) ''' my_retrial(id='2W summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_14d) #content = '''beeline.bln_e_output(qry % (ts_now, ts_28d), os.path.join(config.mrqos_data, 'processed_4wjoin_full_wloads.tmp')) ''' my_retrial(id='4W summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_28d) #content = '''beeline.bln_e_output(qry % (ts_now, ts_3d), os.path.join(config.mrqos_data, 'processed_3djoin_full_wloads.tmp')) ''' my_retrial(id='3D summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_3d) # new summary (with in-out-ratio) my_retrial(id='3Dn summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_3d) my_retrial(id='2Wn summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_14d) my_retrial(id='4Wn summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_28d)
def main():
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'io_ratio_join.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    ts = int(time.time())
    logger.info('########### ts=%s ###########' % str(ts))
    #datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts)))
    #hourstamp = time.strftime('%H', time.gmtime(float(ts)))

    # IO-Ratio Join:
    last_mrqos_region_partition = beeline.get_last_partitions('mrqos.mrqos_region')
    [datestamp, hourstamp, ts_region] = [x.split('=')[1] for x in last_mrqos_region_partition.split('/')]
    logger.info('MRQOS mrqos_region partition: datestamp=%s, hour=%s, ts_region=%s' % (datestamp, hourstamp, ts_region))

    mapruleinfo_partitions = [x for x in sorted(beeline.show_partitions('mrqos.maprule_info').split('\n'), reverse=True) if '=' in x]
    mapruleinfo_partitions = [x for x in mapruleinfo_partitions if x < 'ts=%s' % ts_region]
    ts_mapruleinfo = mapruleinfo_partitions[0].split('=')[1]
    logger.info('MRQOS maprule_info partition: ts_mapruleinfo=%s' % ts_mapruleinfo)

    region_summary_retrial_max = 10

    # ###############################
    # # The In-Out Ratio hive procedure
    # ###############################
    # check if the summary has been performed on this particular hour (last hour)
    # print " **** checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_table,
                                       'mrqos_ioratio',
                                       'datestamp=%s' % datestamp,
                                       'hour=%s' % hourstamp,
                                       'ts=%s' % ts_region,
                                       '000000_0.deflate')):
        logger.info(' Joined file does not exist.')
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_ioratio.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, ts_region, datestamp, hourstamp, ts_region, ts_mapruleinfo)
        print strcmd_s
        f.close()

        # strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        # query_result_file = os.path.join(config.mrqos_query_result, 'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))

        print " BLN for hourly summary: day = %s, hour = %s. " % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            tic = time.time()
            try:
                beeline.bln_e(strcmd_s)
                logger.info(' ****** success with time cost = %s.' % str(time.time()-tic))
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                logger.error(' ****** failed with time cost = %s up to # retrials=%s' % (str(time.time()-tic), str(count_retrial)))
                logger.error('error %s' % e.message)
                hdfsutil.rm(os.path.join(config.hdfs_table,
                                         'mrqos_ioratio',
                                         'datestamp=%s' % datestamp,
                                         'hour=%s' % hourstamp,
                                         'ts=%s' % ts_region), r=True)
                count_retrial += 1
    else:
        logger.info(' Joined file exists.')
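# hdfsutil.test_file() is an in-house wrapper not defined in this file. Judging from the
# branch above (the join only runs when the deflate output file is absent), the sketch below
# assumes it mirrors `hadoop fs -test -e`, returning True when the path does NOT exist
# (non-zero exit code) and False when it does.
def test_file_sketch(hdfs_path):
    """ hypothetical sketch: True if `hdfs_path` is missing on HDFS, False otherwise. """
    return sp.call(['hadoop', 'fs', '-test', '-e', hdfs_path]) != 0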