Example #1
def mrqos_join_cleanup():
    """ when called, this function will delete all partitions
        the clnspp table as long as it is older than the threshold """

    # list the existing partitions by checking the HDFS folders
    joined_partitions = hdfsutil.ls(config.hdfs_table_join)
    str_parts_list = [i.split('=', 1)[1] for i in joined_partitions]
    str_parts_list_int = map(int, str_parts_list)  # note: currently unused

    # check if "partitions" is within the threshold
    timenow = int(time.time())

    # get the list of retired data in HDFS using hive partitions
    try:
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join').split('\n')
                            if '=' in x and x.split('=')[1] < str(timenow-config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join', 'ts<%s' % str(timenow-config.mrqos_join_delete))
            print " drop partitions successful. "
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed to remove HDFS folder for mrqos_join at partition folder %s" % str(partition_id)
            print " remove HDFS successful. "
        except sp.CalledProcessError as e:
            print ">> failed to drop partitions"
    except sp.CalledProcessError as e:
        print ">> failed to obtain retire partition list (HIVE)"
        print e.message
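
# hdfsutil and beeline above are project helpers not shown in this excerpt.
# They are assumed to be thin wrappers over the standard CLIs; a minimal,
# hypothetical sketch of the two hdfsutil calls used by the cleanup functions:
def _hdfs_ls(path):
    # list an HDFS directory, keeping partition-style entries (assumes `hdfs dfs -ls`)
    out = sp.check_output(['hdfs', 'dfs', '-ls', path])
    return [line.split()[-1] for line in out.splitlines() if '=' in line]

def _hdfs_rm(path, r=False):
    # remove an HDFS path, recursively when r=True (assumes `hdfs dfs -rm [-r]`)
    sp.check_call(['hdfs', 'dfs', '-rm'] + (['-r'] if r else []) + [path])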


def mrqos_join_cleanupv2(logger):
    """ When called, this function deletes all partitions of
        the clnspp table that are older than the threshold. """

    # list the existing partitions by checking the HDFS folders
    joined_partitions = hdfsutil.ls(config.hdfs_table_join2)
    str_parts_list = [i.split('=', 1)[1] for i in joined_partitions]
    str_parts_list_int = map(int, str_parts_list)  # note: currently unused

    # check if "partitions" is within the threshold
    timenow = int(time.time())

    # get the list of retired data in HDFS using hive partitions
    try:
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join2').split('\n')
                            if '=' in x and x.split('=')[1] < str(timenow-config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join2', 'ts<%s' % str(timenow-config.mrqos_join_delete))
            logger.info("drop hive partitions successful. ")
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join2', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    logger.error('failed to remove HDFS folder for mrqos_join2 at partition folder %s' % str(partition_id))
            logger.info('remove HDFS successful. ')
        except sp.CalledProcessError as e:
            logger.error('failed to drop partitions. ')
    except sp.CalledProcessError as e:
        logger.error('failed to obtain retire partition list (HIVE)')
        logger.error('error message: %s' % e.message)


def main():
    """ Run the summary queries over several measurement windows, upload the
        results to HDFS, and join the tables at a single time point. """

    # different queries (various types)
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'mrqos_sum_comparison.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    # ##############################
    n_retrial = config.query_retrial
    day_in_seconds = 86400

    list_of_partitions = [x.split('=')[-1] for x in beeline.show_partitions('mrqos.mrqos_sum').split('\n') if '=' in x]
    ts_now = list_of_partitions[-1]

    ts_ex_14d = time.strftime('%Y%m%d',
                              time.gmtime(time.mktime(time.strptime(ts_now,
                                                                    '%Y%m%d')) - 14 * day_in_seconds))
    ts_14d = [x for x in list_of_partitions if x <= ts_ex_14d][-1]

    ts_ex_28d = time.strftime('%Y%m%d',
                              time.gmtime(time.mktime(time.strptime(ts_now,
                                                                    '%Y%m%d')) - 28 * day_in_seconds))
    ts_28d = [x for x in list_of_partitions if x <= ts_ex_28d][-1]

    ts_ex_3d = time.strftime('%Y%m%d',
                             time.gmtime(time.mktime(time.strptime(ts_now,
                                                                   '%Y%m%d')) - 3 * day_in_seconds))
    ts_3d = [x for x in list_of_partitions if x <= ts_ex_3d][-1]

    #content = '''beeline.bln_e_output(qry0 % (ts_now, ts_14d), os.path.join(config.mrqos_data, 'processed_2wjoin_full.tmp')) '''
    my_retrial(id='2W summary (no load)', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_14d)
    #content = '''beeline.bln_e_output(qry % (ts_now, ts_14d), os.path.join(config.mrqos_data, 'processed_2wjoin_full_wloads.tmp')) '''
    my_retrial(id='2W summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_14d)
    #content = '''beeline.bln_e_output(qry % (ts_now, ts_28d), os.path.join(config.mrqos_data, 'processed_4wjoin_full_wloads.tmp')) '''
    my_retrial(id='4W summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_28d)
    #content = '''beeline.bln_e_output(qry % (ts_now, ts_3d), os.path.join(config.mrqos_data, 'processed_3djoin_full_wloads.tmp')) '''
    my_retrial(id='3D summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_3d)

    # new summary (with in-out-ratio)
    my_retrial(id='3Dn summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_3d)
    my_retrial(id='2Wn summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_14d)
    my_retrial(id='4Wn summary', n_retrial=n_retrial, logger=logger, ts1=ts_now, ts2=ts_28d)
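
# my_retrial is not defined in this excerpt; a hypothetical sketch of a retry
# wrapper consistent with how it is called above (the commented-out `content`
# strings suggest each id maps to one beeline query; that dispatch is assumed):
def my_retrial(id, n_retrial, logger, ts1, ts2):
    for attempt in range(n_retrial):
        try:
            run_query_by_id(id, ts1, ts2)  # assumed dispatcher, not shown here
            logger.info('%s: succeeded on attempt %d' % (id, attempt + 1))
            return True
        except sp.CalledProcessError as e:
            logger.warning('%s: attempt %d failed: %s' % (id, attempt + 1, e.message))
    logger.error('%s: giving up after %d attempts' % (id, n_retrial))
    return False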


def main():
    # set up the logger
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'mpg_cluster.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # NSJOIN dayidx # only partitioned by DAY
    day_idx = beeline.get_last_partitions('mapper.nsjoin').split('=')[1]
    # BAREBONES dayidx # only partitioned by DAY
    day_bb = [x for x in beeline.show_partitions('mapper.barebones').split('\n') if '=%s' % (day_idx) in x]
    # MAPPOINTS dayidx # partitioned by DAY and UUID (pick the last uuid)
    mappoints_data = sorted([x for x in beeline.show_partitions('mapper.mappoints').split('\n') if '=%s' % (day_idx) in x])[-1].split('/')
    [day_mps, uuid_idx] = [x.split('=')[1] for x in mappoints_data]

    if day_idx != day_mps:
        logger.error('mapper.mappoints and mapper.nsjoin have different days; data may be missing at the source.')
        return

    if len(day_bb) == 0:
        logger.warning('mapper.barebones data missing for this particular day.')
        #return

    logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))

    logger.info('begin spark process.')
    getting_mappoint_data = '''
        select b1.mpgid mpgid, b1.lat lat, b1.lon lon, b1.country country,
               b1.mpgload mpgload, b1.allowed_private_regions allowed_private_regions,
               b2.asnum asnum, b2.ip ip
        from (select mpgid, lat, lon, country, mpgload, allowed_private_regions
              from mapper.mappoints
              where day=%s and uuid="%s" and lat is not NULL and lon is not NULL and ghostonly=0) b1
        left outer join (select collect_set(ns_ip) ip, collect_set(asnum) asnum, mpgid
                         from (select ns_ip, mpd_uuid, mpgid, asnum, demand, day
                               from mapper.nsjoin
                               where day=%s and mpd_uuid="%s" and demand>0.01
                               order by demand desc) a
                         group by mpgid) b2
        on b2.mpgid=b1.mpgid ''' % (day_idx, uuid_idx, day_idx, uuid_idx)
    geo_total_cap_query = '''
        select * from (select country, network,
                              sum(peak_bitcap_mbps) peak_bitcap_mbps,
                              sum(peak_flitcap_mfps) peak_flitcap_mfps,
                              sum(numvips) numvips
                       from mapper.regioncapday
                       where day=%s and network in ('freeflow', 'essl') and prp='private'
                       group by country, network) a ''' % day_idx
    geo_total_cap_public_query = '''
        select * from (select country, network,
                              sum(peak_bitcap_mbps) peak_bitcap_mbps,
                              sum(peak_flitcap_mfps) peak_flitcap_mfps,
                              sum(numvips) numvips
                       from mapper.regioncapday
                       where day=%s and network in ('freeflow', 'essl') and prp='public'
                       group by country, network) a ''' % day_idx

    sc = SparkContext()
    hiveCtx = HiveContext(sc)

    rows = hiveCtx.sql(getting_mappoint_data)

    regInfoRows = hiveCtx.sql('select * from mapper.regioncapday where day=%s and peak_bitcap_mbps is not null and peak_flitcap_mfps is not null' % (day_idx))
    geo_total_cap = hiveCtx.sql(geo_total_cap_query)
    geo_total_cap_p = hiveCtx.sql(geo_total_cap_public_query)


    # rdd format: [regionid, [mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load, mpg-asnum, mpg-nsip]]
    region_mpginfo_pair = rows.map(lambda x: [[x.mpgid,
                                               x.lat,
                                               x.lon,
                                               x.country,
                                               x.mpgload,
                                               x.asnum,
                                               x.ip], x.allowed_private_regions])\
                                .flatMapValues(lambda x: x).map(lambda x: [x[1], x[0]])

    #region_mpginfo_pair.first()

    # rdd format: [regionid, [reg-lat, reg-lon, reg-capacity(bit mbps), reg-capacity(flit mfps), reg-country, reg-numvips, reg-service, reg-prp]]
    # ps. prp=1: private, prp=0: public
    region_latlon = regInfoRows.map(lambda x: [x.region, [x.latitude,
                                                          x.longitude,
                                                          x.peak_bitcap_mbps,
                                                          x.peak_flitcap_mfps,
                                                          x.country,
                                                          x.numvips,
                                                          'W' if x.network=='freeflow' else ('S' if x.network=='essl' else 'O'),
                                                          1 if x.prp=='private' else 0]])\
                                .filter(lambda x: x[1][6]=='W' or x[1][6]=='S')

    region_public_list = region_latlon\
        .filter(lambda x: x[1][7] == 0)\
        .map(lambda x: ('all', [[x[0]]]))\
        .reduceByKey(lambda a, b: [a[0]+b[0]])\
        .map(lambda x: x[1][0]).collect()

    region_public_list = [0] + sorted(region_public_list[0])

    # dummy region: region id 0 is unioned in so that every cluster later joins
    # at least one "public" key; it is stripped back out downstream via the
    # len(...)-1 count and the [1:] slice on the sorted public region list
    rdd2 = sc.parallelize([([0, [0, 0, 0.0, 0.0, 'US', 0, 'W', 1]])])
    region_latlon = region_latlon.union(rdd2)

    # perform the join into tuple of (K, (V1, V2)):
    # (regionid, ([mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load], [reg-lat, reg-lon, reg-cap, reg-country, reg-numvips, reg-service]))
    # rdd  = (mpgid, regionid, [lat1, lon1, lat2, lon2, distance],
    #               reg-cap-bit(gbps), reg-cap-flit(gbps), reg-country, reg-numvips, reg-services,
    #               mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #               mpg-lat, mpg-lon)
    mpgid_reg_geo = region_mpginfo_pair.join(region_latlon).map(lambda x: [x[1][0][0],
                                                                           x[0],
                                                                           geodesic_distance(x[1][0][1],
                                                                                             x[1][0][2],
                                                                                             x[1][1][0],
                                                                                             x[1][1][1]),
                                                                           round(float(x[1][1][2])/1000.0, 3),
                                                                           round(float(x[1][1][3])/1000.0, 3),
                                                                           x[1][1][4], # reg-country
                                                                           x[1][1][5], # reg-numvips
                                                                           x[1][1][6], # reg-services
                                                                           x[1][0][3],
                                                                           x[1][0][4],
                                                                           x[1][0][5],
                                                                           x[1][0][6],
                                                                           x[1][0][1],
                                                                           x[1][0][2]])
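
    # geodesic_distance, used in the join above, is an external helper; the
    # pipeline later filters on element [4] of its result as a distance in
    # miles, so it presumably returns [lat1, lon1, lat2, lon2, distance].
    # A haversine-based sketch under that assumption (hypothetical; named
    # differently so it does not shadow the real helper):
    import math

    def _geodesic_distance_sketch(lat1, lon1, lat2, lon2):
        # great-circle distance in miles between two (lat, lon) points
        radius_miles = 3958.8
        p1, p2 = math.radians(lat1), math.radians(lat2)
        a = math.sin(math.radians(lat2 - lat1) / 2) ** 2 + \
            math.cos(p1) * math.cos(p2) * math.sin(math.radians(lon2 - lon1) / 2) ** 2
        return [lat1, lon1, lat2, lon2, 2 * radius_miles * math.asin(math.sqrt(a))]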

    # filtering on mapping distance < 500 miles
    # filtering on reg-country = mpg-country
    # filtering on region capacity > 1 Gbps
    # rdd format = (mpgid, [[regionid], distance, [capacity-w, capacity-s], numvips, 1, mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #                        mpg-lat, mpg-lon])
    #mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: x[2][4] < 500)\
    #    .filter(lambda x: x[5] == x[8])\
    #    .filter(lambda x: x[3] > 1)\
    #    .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]]))

    # active version, relaxed country constraint: within 500 miles anywhere,
    # or within 1000 miles when reg-country matches mpg-country:
    mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: (x[2][4] < 500) or (x[5]==x[8] and x[2][4] < 1000))\
        .filter(lambda x: x[3] > 1)\
        .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]]))

    #mpgid_reg_distance.first()

    # group by mpgid
    # rdd format = (mpgid, [[reg-list],
    #                       avg_distance,
    #                       total_cap freeflow,
    #                       total_cap essl,
    #                       total num vips,
    #                       rg_count,
    #                       mpg-country,
    #                       mpg-load,
    #                       [mpg-asnum],
    #                       [mpg-nsip])
    mpgid_reglist_avgDistance_capacity_nReg = mpgid_reg_distance\
        .reduceByKey(lambda a, b: [a[0]+b[0], a[1]+b[1], [a[2][0]+b[2][0], a[2][1]+b[2][1]], a[3]+b[3], a[4]+b[4],
                                   a[5], a[6], a[7], a[8], a[9], a[10]])\
        .map(lambda x: (x[0], [sorted(x[1][0]), # region_list
                               round(x[1][1]/x[1][4], 2), # avg distance
                               round(x[1][2][0], 2), # total capacity - w
                               round(x[1][2][1], 2), # total capacity - s
                               x[1][3], # numvips
                               x[1][4], # total region count
                               x[1][5], # mpg country
                               x[1][6], # mpg load
                               x[1][7], # mpg asnum
                               x[1][8], # mpg nsip
                               x[1][9], # mpg lat
                               x[1][10]])) # mpg lon

    # disable the count
    #total_mpg_with_region = mpgid_reglist_avgDistance_capacity_nReg.count()

    # rdd format = (reg, [(reg-list), [[mpg-list], avg_distance, total_cap_w, total_cap_s, total_numvips,
    #                           reg-count, cluster_country, mpg-load, mpg-count, mpg-lat, mpg-lon]])
    reg_reglist_mpgid_avgDistance_capacity_nReg_country = mpgid_reglist_avgDistance_capacity_nReg\
        .map(lambda x: (tuple(x[1][0]), [[x[0]], # mpgid list
                                          x[1][1], # avg_distance
                                          x[1][2], # region total capacity freeflow
                                          x[1][3], # region total capacity essl
                                          x[1][4], # total num vips
                                          x[1][5], # total region count
                                          [x[1][6]], # mpg country list
                                          x[1][7], # mpg load
                                          1, # mpg-count
                                          x[1][8] if x[1][8] else [], # [mpg-asnum]
                                          x[1][9] if x[1][9] else [], # [mpg-nsip]
                                          [x[1][10]], # [mpg-lat] # single element array
                                          [x[1][11]], # [mpg-lon] # single element array
                                          [x[1][7]] # [mpg-load] # single element array
                                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0],
                                   a[1],
                                   a[2],
                                   a[3],
                                   a[4],
                                   a[5],
                                   a[6]+b[6],
                                   a[7]+b[7],
                                   a[8]+b[8],
                                   a[9]+b[9],
                                   a[10]+b[10],
                                   a[11]+b[11],
                                   a[12]+b[12],
                                   a[13]+b[13]])\
        .filter(lambda x: sum(x[1][13]) > 0.0001)\
        .map(lambda x: (x[0], [sorted(x[1][0]), # mpgid list
                               x[1][1], # avg_distance
                               x[1][2], # reg-cap-w
                               x[1][3], # reg-cap-s
                               x[1][4], # numvips
                               x[1][5], # reg-count
                               [str(y) for y in sorted(list(set(x[1][6])))], # mpg-country list
                               x[1][7], # mpg-load
                               x[1][8], # mpg-count
                               [str(y) for y in sorted(list(set(x[1][9])))], # [mpg-asnum]
                               [str(y) for y in sorted(list(set(x[1][10])))], # [mpg-nsip]
                               geo_centroid(x[1][11], x[1][12], x[1][13]) # [mpg: lat, lon, por, porsigma]
                               ]))\
        .map(lambda x: ([':'.join([str(y) for y in list(x[1][6])]), # [mpg-country list]
                        x[1][1], # avg_distance
                        x[1][2], # reg-cap-w
                        x[1][3], # reg-cap-s
                        x[1][4], # numvips
                        x[1][5], # reg-count
                        x[1][7], # mpg-load
                        x[1][8], # mpg-count
                        ':'.join([str(y) for y in x[0]]), # [region-list]
                        ':'.join([str(y) for y in list(x[1][0])]), # [mpg-list]
                        ':'.join([str(y) for y in x[1][9]]) if len(x[1][9])>0 else 'NULL', # [mpg-asnum]
                        ':'.join([str(y) for y in x[1][10]]) if len(x[1][10])>0 else 'NULL', # [mpg-nsip]
                        x[1][11] # [mpg-lat, mpg-lon, mpg-por, mpg-porsigma]
                        ],
                        region_public_list
                        ))\
        .flatMapValues(lambda x: x)\
        .map(lambda x: [x[1], x[0]])
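
    # geo_centroid(lats, lons, loads) in the chain above is another external
    # helper; the inline comment reads its output as [mpg: lat, lon, por,
    # porsigma]. A hypothetical load-weighted-centroid sketch (the real
    # por/porsigma definitions are not shown in this excerpt):
    def _geo_centroid_sketch(lats, lons, loads):
        total = float(sum(loads))
        lat_c = sum(la * w for la, w in zip(lats, loads)) / total
        lon_c = sum(lo * w for lo, w in zip(lons, loads)) / total
        # assumed: por is the cluster's total load, porsigma its load-weighted
        # positional spread around the centroid
        var = sum(w * ((la - lat_c) ** 2 + (lo - lon_c) ** 2)
                  for la, lo, w in zip(lats, lons, loads)) / total
        return [round(lat_c, 3), round(lon_c, 3), total, var ** 0.5]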

    reglist_mpgid_avgDistance_capacity_nReg_country = reg_reglist_mpgid_avgDistance_capacity_nReg_country\
        .join(region_latlon)\
        .map(lambda x: [x[1][0]]+[x[1][1]]+[geodesic_distance(x[1][0][12][0],
                                                             x[1][0][12][1],
                                                             x[1][1][0],
                                                             x[1][1][1])] + [x[0]] if x[0] > 0\
             else [x[1][0]]+[x[1][1]]+[[x[1][0][12][0],
                                       x[1][0][12][1],
                                       x[1][1][0],
                                       x[1][1][1],
                                       0.0]] + [x[0]])\
        .filter(lambda x: x[2][4] < 500)\
        .map(lambda x: (tuple([x[0][0],
                              x[0][1],
                              x[0][2],
                              x[0][3],
                              x[0][4],
                              x[0][5],
                              x[0][6],
                              x[0][7],
                              x[0][8],
                              x[0][9],
                              x[0][10],
                              x[0][11],
                              x[0][12][0],
                              x[0][12][1],
                              x[0][12][2],
                              x[0][12][3]]), # mpg-information
                        [x[1][2], # pub.region.cap.ff
                         x[1][3], # pub.region.cap.essl
                         x[1][5], # pub.region.vip
                         [x[3]] # single element region id
                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0], # sum( pub.region.cap.ff )
                                   a[1]+b[1], # sum( pub.region.cap.essl )
                                   a[2]+b[2], # sum( pub.region.cap.vip )
                                   a[3]+b[3] # [pub.regions]
                                   ])\
        .map(lambda x: [x[0][0], # [mpg-country-list]
                        x[0][1], # avg-distance
                        x[0][12], # mpg-lat
                        x[0][13], # mpg-lon
                        x[0][14], # mpg-por
                        x[0][15], # mpg-porsigma
                        x[0][2], # pri.region.cap.ff (gbps)
                        x[0][3], # pri.region.cap.essl (gbps)
                        x[0][4], # pri.vips
                        x[0][5], # pri.region.count
                        round(float(x[1][0])/1000.0, 3), # pub.region.cap.ff (gbps)
                        round(float(x[1][1])/1000.0, 3), # pub.region.cap.essl (gbps)
                        x[1][2], # pub.vips
                        len(x[1][3])-1, # pub.region.count
                        x[0][6], # mpg-load
                        round(x[0][7], 6), # mpg-count
                        x[0][8], # [pri reg-list]
                        ':'.join([str(y) for y in sorted(x[1][3])][1:]) if len(x[1][3])>1 else 'NULL', # [pub reg-list]
                        x[0][9], # [mpg-list]
                        x[0][10], # [mpg-asnum]
                        x[0][11] # [mpg-nsip]
                        ])

    # data exporting to local
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist = pd.DataFrame(columns=['cl_geoname',
                                                                                           'cl_avgDistance',
                                                                                           'cl_lat',
                                                                                           'cl_lon',
                                                                                           'cl_por',
                                                                                           'cl_porsigma',
                                                                                           'pri_cap_ff_gbps',
                                                                                           'pri_cap_essl_gbps',
                                                                                           'pri_nvips',
                                                                                           'pri_nReg',
                                                                                           'pub_cap_ff_gbps',
                                                                                           'pub_cap_essl_gbps',
                                                                                           'pub_nvips',
                                                                                           'pub_nReg',
                                                                                           'cl_mpgLoad',
                                                                                           'cl_nMpg',
                                                                                           'pri_regList',
                                                                                           'pub_regList',
                                                                                           'mpgList',
                                                                                           'mpgASList',
                                                                                           'mpgNSIPList'])

    geo_cluster_full_info = reglist_mpgid_avgDistance_capacity_nReg_country.collect()

    logger.info('begin write to local disk.')
    for item in range(len(geo_cluster_full_info)):
        temp = geo_cluster_full_info[item]
        country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.loc[item] = temp # the above should be temp[1][0] for the mpglist

    data_folder = '/home/testgrp/MRQOS/project_mpd_clustering/data'
    filename = 'geo_full_cluster_info.%s.%s.csv' % (day_idx, uuid_idx)
    fileDestination = os.path.join(data_folder, filename)
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.to_csv(fileDestination,
                                                                          sep=',', index=False, header=False)

    logger.info('begin to upload to hdfs.')
    tablename = 'mrqos.mpg_cluster'
    hdfs_d = os.path.join(config.hdfs_table,
                          'mpg_cluster',
                          'datestamp=%s' % day_idx,
                          'uuid=%s' % uuid_idx)
    partition = '''datestamp=%s, uuid='%s' ''' % (day_idx, uuid_idx)
    processed_filename = '.'.join(filename.split('.')[0:-1])+'.processed.csv'
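    # The awk below caps the colon-separated mpgASList (field 20) and
    # mpgNSIPList (field 21) at five entries each, then prints 18 of the 21
    # comma-separated input columns (dropping pri_regList, pub_regList and
    # mpgList, fields 17-19), space-separated, into the processed file.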
    cmd_str = ''' cat %s | awk -F, '{n=split($21,a,":"); if(n>5){$21=a[1]":"a[2]":"a[3]":"a[4]":"a[5];} m=split($20,b,":"); if(m>5){$20=b[1]":"b[2]":"b[3]":"b[4]":"b[5];}print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$20,$21;}' > %s ''' % (os.path.join(data_folder, filename),
                                                                                                                                                                                                                                                              os.path.join(data_folder, processed_filename))
    sp.check_call(cmd_str, shell=True)
    try:
        beeline.upload_to_hive(fileDestination, hdfs_d, partition, tablename, logger)
        # os.remove(fileDestination)
    except sp.CalledProcessError as e:
        logger.error('upload to HDFS + update Hive table failed.')


def main():
    # logging set-up
    logging.basicConfig(
        filename=os.path.join(config.mrqos_logging, 'hive_table_cleanup.log'),
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    # ##############################

    ts = int(time.time())
    ts_timeout = ts - config.mrqos_table_delete * 24 * 3  # retention window: 3 days (24*3 hours)

    date_timeout = time.strftime('%Y%m%d', time.gmtime(float(ts_timeout)))
    # hourstamp = time.strftime('%H', time.gmtime(float(ts)))

    list_to_clean = sorted(
        list(
            set([
                x.split('/')[0] for x in beeline.show_partitions(
                    'mrqos.mrqos_region').split('\n')
            ])))
    list_to_clean = [
        x for x in list_to_clean
        if ('=' in x and x.split('=')[1] < date_timeout)
    ]

    logger.info('handling table: mrqos_region')
    try:
        logger.info('removing the data in HDFS')
        # remove the hdfs folder
        for item in list_to_clean:
            hdfsutil.rm(os.path.join(config.hdfs_table, 'mrqos_region',
                                     '%s' % item),
                        r=True)

        # alter the hive table: mrqos_region
        try:
            logger.info('drop partitions, condition: datestamp<%s' %
                        str(date_timeout))
            beeline.drop_partitions(tablename='mrqos.mrqos_region',
                                    condition='datestamp<%s' %
                                    str(date_timeout))
        except sp.CalledProcessError as e:
            logger.error('drop partition failed')
            logger.error('error: %s' % e.message)

    except sp.CalledProcessError as e:
        logger.error('removing data from HDFS failed')
        logger.error('error: %s' % e.message)

    # ##############################
    # target table: maprule_info, mcm_machines
    # ##############################

    query_item = ['maprule_info', 'mcm_machines']

    for scan in query_item:
        logger.info('handling table: %s' % scan)
        list_to_clean = sorted(
            list(
                set([
                    x.split('/')[0]
                    for x in beeline.show_partitions('mrqos.%s' %
                                                     scan).split('\n')
                ])))
        list_to_clean = [
            x for x in list_to_clean
            if ('=' in x and int(x.split('=')[1]) < ts_timeout)
        ]

        try:
            logger.info('removing the data in HDFS')
            # remove the hdfs folder
            for item in list_to_clean:
                hdfsutil.rm(os.path.join(config.hdfs_table, '%s' % scan,
                                         '%s' % item),
                            r=True)

            # alter the hive table for the current target table
            try:
                logger.info('drop partitions, condition: ts<%s' %
                            str(ts_timeout))
                beeline.drop_partitions(tablename='mrqos.%s' % scan,
                                        condition='ts<%s' % str(ts_timeout))
            except sp.CalledProcessError as e:
                logger.error('drop partition failed')
                logger.error('error: %s' % e.message)

        except sp.CalledProcessError as e:
            logger.error('removing data from HDFS failed')
            logger.error('error: %s' % e.message)
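
# beeline.drop_partitions above is assumed to issue a Hive DDL statement; a
# hypothetical sketch via the beeline CLI (connection options omitted),
# relying on Hive's support for dropping partitions by comparison, e.g.
# PARTITION (ts < '12345'):
def _drop_partitions_sketch(tablename, condition):
    hql = 'ALTER TABLE %s DROP IF EXISTS PARTITION (%s);' % (tablename, condition)
    sp.check_call(['beeline', '-e', hql])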


def main():
    # set up the logger
    logging.basicConfig(
        filename=os.path.join(config.mrqos_logging, 'ra_summary.log'),
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)
    # table nsjoin (day, uuid)
    # table mapmon (day, uuid)
    datenow = str(datetime.date.today() - datetime.timedelta(1))
    day_idx = datenow[0:4] + datenow[5:7] + datenow[8:10]
    uuid_list = [
        x.split('=')[-1]
        for x in beeline.show_partitions('mrqos.mapmon_sum').split('\n')
        if day_idx in x
    ]
    sc = SparkContext()
    hiveCtx = HiveContext(sc)
    post_partition_n = 1000

    for uuid_idx in uuid_list:
        # ns_ip, demand, asnum ns_asnum, ns_country, ns_continent, ns_lat, ns_lon, ns_mpgid, mpgload
        nsjoin_query = """ select ns_ip, demand, asnum ns_asnum, country_code ns_country, continent ns_continent, round(latitude,3) ns_lat, round(longitude,3) ns_lon, mpgid ns_mpgid, mpgload from mapper.nsjoin where day={} and mpd_uuid='{}' and longitude is not NULL and latitude is not NULL and demand > 1""".format(
            day_idx, uuid_idx)

        # mpgid, mrid, mpg_type, region, link, min_s, max_s, min_r, max_r, ping, local, cont_fb, mpd_dftime, ecor, continent, country, latitude, longitude, prp
        mapmon_query = """ select mpgid, mrid, mpg_type, region, link, min_s, max_s, min_r, max_r, ping, local, cont_fb, mpd_dftime, ecor, continent, country, latitude, longitude, prp from mrqos.mapmon_sum where day={} and mpd_uuid='{}' and longitude is not NULL and latitude is not NULL""".format(
            day_idx, uuid_idx)
        logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))

        nsjoin = hiveCtx.sql(nsjoin_query)
        nsjoin_rows = nsjoin.repartition(post_partition_n).cache()
        data = hiveCtx.sql(mapmon_query)
        data_rows = data.repartition(post_partition_n).cache()

        col = [
            'mpgid', 'mrid', 'mpg_type', 'region', 'link', 'min_s', 'max_s',
            'min_r', 'max_r', 'ping', 'local', 'cont_fb', 'mpd_dftime', 'ecor',
            'continent', 'country', 'latitude', 'longitude', 'prp', 'ns_ip',
            'demand', 'ns_asnum', 'ns_country', 'ns_continent', 'ns_lat',
            'ns_lon', 'mpgload'
        ]

        cols_appended = [
            'nsip', 'mrid', 'ns_demand', 'ns_asnum', 'ns_country',
            'ns_continent', 'ns_lat', 'ns_lon', 'mpgid', 'mpg_type',
            'mpg_load', 'regions', 'region_links', 'dftime_ratio', 'ecors',
            'list_min_s', 'list_max_s', 'list_min_r', 'list_max_r',
            'region_lats', 'region_lons', 'min_s', 'max_s', 'min_r', 'max_r',
            'ping_ratio', 'local_ratio', 'cont_fb_ratio', 'in_cont_ratio',
            'in_country_ratio', 'private_ratio', 'avg_distance',
            'num_region_mapped', 'mapping_entropy', 'sum_dftime'
        ]

        df = nsjoin_rows.join(data_rows,
                              data_rows.mpgid == nsjoin_rows.ns_mpgid,
                              'inner')[col].cache()
        row1 = data_rows.agg(F.max(data_rows.mpd_dftime)).collect()[0]
        max_dftime = row1[0]

        df2 = df.map(lambda x: x + Row(geodesic_distance_weighted(x.ns_lat,
                                                                  x.ns_lon,
                                                                  x.latitude,
                                                                  x.longitude,
                                                                  x.mpd_dftime)))\
                .map(lambda x: ((   x[19], # nsip
                                    x[20], # demand
                                    x[21], # ns_asnum
                                    x[22], # ns_country
                                    x[23], # ns_continent
                                    round(x[24], 3), # ns_lat & ns_lon
                                    round(x[25], 3),
                                    x[0], # mpgid
                                    x[1], # mrid
                                    x[2], # mpg type
                                    x[26], # mpg load
                                    ),
                               [   [int(x[3])], # region
                                   [str(int(x[3])) + "_" + str(int(x[4]))], # region_link
                                   x[5]/max_dftime, # min_s
                                   x[6]/max_dftime, # max_s
                                   x[7]/max_dftime, # min_r
                                   x[8]/max_dftime, # max_r
                                   x[9]/max_dftime, # ping ratio
                                   x[10]/max_dftime, # local ratio
                                   x[11]/max_dftime, # cont_fb ratio
                                   [round(x[12]/max_dftime, 3)], # mpd_dftime/max_dftime (time ratio)
                                   [int(x[13])], # ecor
                                   x[12]/max_dftime * [0, 1][x[14] == x[23]], # mapping in-continent ratio
                                   x[12]/max_dftime * [0, 1][x[15] == x[22]], # mapping in-country ratio
                                   [round(x[16], 3)], # lat
                                   [round(x[17], 3)], # lon
                                   x[18]/max_dftime, # prp
                                   x[27]/max_dftime, # w_distance
                                   x[12],
                                   [round(x[5]/x[12], 2)], # min_s list
                                   [round(x[6]/x[12], 2)], # max_s list
                                   [round(x[7]/x[12], 2)], # min_r list
                                   [round(x[8]/x[12], 2)], # max_r list
                               ]))\
                .reduceByKey(lambda a, b: [x+y for x, y in zip(a, b)])\
                .map(lambda x: [x[0][0], # nsip
                                x[0][8], # mrid
                                x[0][1], # demand
                                x[0][2], # ns_asnum
                                x[0][3], # ns_country
                                x[0][4], # ns_continent
                                x[0][5], # ns_lat
                                x[0][6], # ns_lon
                                x[0][7], # mpgid
                                x[0][9], # mpg type
                                x[0][10], # mpg load
                                x[1][0], # list of region
                                x[1][1], # list of region_link
                                [round(100 * float(y), 2) for y in x[1][9]], # list of covered_record ratio
                                x[1][10], # list of ecor
                                x[1][13], # list of region lat
                                x[1][14], # list of region lon
                                round(x[1][2] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # min_s
                                round(x[1][3] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # max_s
                                round(x[1][4] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # min_r
                                round(x[1][5] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # max_r
                                round(100 * x[1][6] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # ping ratio
                                round(100 * x[1][7] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # local ratio
                                round(100 * x[1][8] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # cont_fb ratio
                                round(100 * x[1][11] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # mapping in-continent ratio
                                round(100 * x[1][12] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # mapping in-country ratio
                                round(100 * x[1][15] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # private ratio
                                round(x[1][16] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # w_distance
                                round(x[1][17], 3), # summation of covered dftime
                                x[1][18], # list of min_s
                                x[1][19], # list of max_s
                                x[1][20], # list of min_r
                                x[1][21], # list of max_r
                                len(x[1][9]), # number of different regions mapped
                                round(computeEntropyPMF(x[1][9]), 6), # entropy of the region assignments
                                ])\
                .map(lambda x: x + [[i[0] for i in sorted(enumerate([float(y) for y in x[13]]), key=lambda z:z[1], reverse=True)]])\
                .map(lambda x: x[:11] + [':'.join([str(x[11][i]) for i in x[35]]), # list of region
                                         ':'.join([str(x[12][i]) for i in x[35]]), # list of region_link
                                         ':'.join([str(x[13][i]) for i in x[35]]), # list of covered_record ratio
                                         ':'.join([str(x[14][i]) for i in x[35]]), # list of ecor
                                         ':'.join([str(x[29][i]) for i in x[35]]), # list of min_s
                                         ':'.join([str(x[30][i]) for i in x[35]]), # list of max_s
                                         ':'.join([str(x[31][i]) for i in x[35]]), # list of min_r
                                         ':'.join([str(x[32][i]) for i in x[35]]), # list of max_r
                                         ':'.join([str(x[15][i]) for i in x[35]]), # list of region lat
                                         ':'.join([str(x[16][i]) for i in x[35]]), # list of region lon
                                         ] + x[17:28] + x[33:35] + [x[28]])\
                .toDF(cols_appended).cache()

        df_all = df2.map(lambda x: toCSVLine(x))
        logger.info('writing into HDFS')
        df_all.saveAsTextFile(
            '/ghostcache/hadoop/data/MRQOS/mrqos_mapmon_stats/datestamp={}/uuid={}'
            .format(day_idx, uuid_idx))
        logger.info('updating Hive table: mrqos_mapmon_stats')
        beeline.add_partitions(
            "mrqos.mrqos_mapmon_stats",
            "datestamp='{}',uuid='{}'".format(day_idx, uuid_idx))
def main():
    """ run the queries for 5 different measurements, upload the data to HDFS
    accordingly, and join the summary tables at single time points """

    # different queries (various types)
    # logging set-up
    logging.basicConfig(
        filename=os.path.join(config.mrqos_logging,
                              'mrqos_sum_comparison.log'),
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    # ##############################
    n_retrial = config.query_retrial
    day_in_seconds = 86400

    list_of_partitions = [
        x.split('=')[-1]
        for x in beeline.show_partitions('mrqos.mrqos_sum').split('\n')
        if '=' in x
    ]
    ts_now = list_of_partitions[-1]

    ts_ex_14d = time.strftime(
        '%Y%m%d',
        time.gmtime(
            time.mktime(time.strptime(ts_now, '%Y%m%d')) -
            14 * day_in_seconds))
    ts_14d = [x for x in list_of_partitions if x <= ts_ex_14d][-1]

    ts_ex_28d = time.strftime(
        '%Y%m%d',
        time.gmtime(
            time.mktime(time.strptime(ts_now, '%Y%m%d')) -
            28 * day_in_seconds))
    ts_28d = [x for x in list_of_partitions if x <= ts_ex_28d][-1]

    ts_ex_3d = time.strftime(
        '%Y%m%d',
        time.gmtime(
            time.mktime(time.strptime(ts_now, '%Y%m%d')) - 3 * day_in_seconds))
    ts_3d = [x for x in list_of_partitions if x <= ts_ex_3d][-1]
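    # The three look-ups above repeat one pattern: step back N days from
    # ts_now, then take the newest partition at or before that date. A small
    # helper (hypothetical, not part of the original module) would read:
    # def partition_days_back(ts_str, n_days):
    #     ts_ex = time.strftime('%Y%m%d', time.gmtime(
    #         time.mktime(time.strptime(ts_str, '%Y%m%d')) - n_days * day_in_seconds))
    #     return [x for x in list_of_partitions if x <= ts_ex][-1]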

    #content = '''beeline.bln_e_output(qry0 % (ts_now, ts_14d), os.path.join(config.mrqos_data, 'processed_2wjoin_full.tmp')) '''
    my_retrial(id='2W summary (no load)',
               n_retrial=n_retrial,
               logger=logger,
               ts1=ts_now,
               ts2=ts_14d)
    #content = '''beeline.bln_e_output(qry % (ts_now, ts_14d), os.path.join(config.mrqos_data, 'processed_2wjoin_full_wloads.tmp')) '''
    my_retrial(id='2W summary',
               n_retrial=n_retrial,
               logger=logger,
               ts1=ts_now,
               ts2=ts_14d)
    #content = '''beeline.bln_e_output(qry % (ts_now, ts_28d), os.path.join(config.mrqos_data, 'processed_4wjoin_full_wloads.tmp')) '''
    my_retrial(id='4W summary',
               n_retrial=n_retrial,
               logger=logger,
               ts1=ts_now,
               ts2=ts_28d)
    #content = '''beeline.bln_e_output(qry % (ts_now, ts_3d), os.path.join(config.mrqos_data, 'processed_3djoin_full_wloads.tmp')) '''
    my_retrial(id='3D summary',
               n_retrial=n_retrial,
               logger=logger,
               ts1=ts_now,
               ts2=ts_3d)

    # new summary (with in-out-ratio)
    my_retrial(id='3Dn summary',
               n_retrial=n_retrial,
               logger=logger,
               ts1=ts_now,
               ts2=ts_3d)
    my_retrial(id='2Wn summary',
               n_retrial=n_retrial,
               logger=logger,
               ts1=ts_now,
               ts2=ts_14d)
    my_retrial(id='4Wn summary',
               n_retrial=n_retrial,
               logger=logger,
               ts1=ts_now,
               ts2=ts_28d)
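my_retrial is defined elsewhere in the module; judging from the call sites
above, it wraps one of the beeline summary queries in a retry loop keyed by
id. A hedged sketch, where query_map (an id-to-query-template mapping) is an
assumption for illustration:

# Hypothetical sketch of my_retrial -- names such as query_map are
# illustrative assumptions, not part of the original module.
def my_retrial(id, n_retrial, logger, ts1, ts2):
    for attempt in range(n_retrial):
        try:
            beeline.bln_e(query_map[id] % (ts1, ts2))  # run the summary query
            logger.info('%s succeeded on attempt #%s' % (id, attempt + 1))
            return True
        except sp.CalledProcessError as e:
            logger.error('%s failed on attempt #%s: %s' % (id, attempt + 1, e.message))
    return False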
def main():
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'io_ratio_join.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    # ##############################

    ts = int(time.time())
    logger.info('########### ts=%s ###########' % str(ts))
    #datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts)))
    #hourstamp = time.strftime('%H', time.gmtime(float(ts)))

    # IO-Ratio Join:
    last_mrqos_region_partition = beeline.get_last_partitions('mrqos.mrqos_region')
    [datestamp, hourstamp, ts_region] = [x.split('=')[1] for x in last_mrqos_region_partition.split('/')]
    logger.info('MRQOS mrqos_region partition: datestamp=%s, hour=%s, ts_region=%s'
                % (datestamp, hourstamp, ts_region))

    mapruleinfo_partitions = [x for x in sorted(beeline.show_partitions('mrqos.maprule_info').split('\n'),
                                                reverse=True) if '=' in x]
    mapruleinfo_partitions = [x for x in mapruleinfo_partitions if x < 'ts=%s' % ts_region]
    ts_mapruleinfo = mapruleinfo_partitions[0].split('=')[1]
    logger.info('MRQOS maprule_info partition: ts_mapruleinfo=%s' % ts_mapruleinfo)

    region_summary_retrial_max = 10

    # ############################### #
    # The In-Out Ratio hive procedure #
    # ############################### #
    # check if the summary has been performed on this particular hour (last hour)
    # print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_table,
                                       'mrqos_ioratio',
                                       'datestamp=%s' % datestamp,
                                       'hour=%s' % hourstamp,
                                       'ts=%s' % ts_region,
                                       '000000_0.deflate')):
        logger.info(' Joined file does not exist; running the join.')
        with open(os.path.join(config.mrqos_hive_query, 'mrqos_ioratio.hive'), 'r') as f:
            strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, ts_region,
                             datestamp, hourstamp, ts_region,
                             ts_mapruleinfo)
        print strcmd_s
        # strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        # query_result_file = os.path.join(config.mrqos_query_result,'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))

        print " BLN for hourly summary: day = %s, hour = %s. " %(datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            tic = time.time()
            try:
                beeline.bln_e(strcmd_s)
                logger.info('    ******  success with time cost = %s.' % str(time.time()-tic))
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                logger.error('    ******  failed with time cost = %s at retrial #%s' % (str(time.time()-tic), str(count_retrial)))
                logger.error('error %s' % e.message)
                hdfsutil.rm(os.path.join(config.hdfs_table,
                                         'mrqos_ioratio',
                                         'datestamp=%s' % datestamp,
                                         'hour=%s' % hourstamp,
                                         'ts=%s' % ts_region), r=True)
                count_retrial += 1
    else:
        logger.info(' Joined file exists.')
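A note on hdfsutil.test_file: since the branch above treats a truthy return
value as "joined file missing", the wrapper presumably surfaces the exit
status of hadoop fs -test -e, which is 0 when the path exists. A sketch under
that assumption:

# Hypothetical sketch of hdfsutil.test_file under the assumption above.
import subprocess as sp

def test_file(path):
    # `hadoop fs -test -e` exits with 0 when the path exists and 1 otherwise,
    # so the return value is truthy exactly when the file is missing.
    return sp.call(['hadoop', 'fs', '-test', '-e', path]) != 0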
def main():
    # set up the logger
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'ra_summary.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)
    # table nsjoin (day, uuid)
    # table mapmon (day, uuid)
    # yesterday's date, formatted as YYYYMMDD
    datenow = str(datetime.date.today()-datetime.timedelta(1))
    day_idx = datenow[0:4]+datenow[5:7]+datenow[8:10]
    uuid_list = [x.split('=')[-1] for x in beeline.show_partitions('mrqos.mapmon_sum').split('\n') if day_idx in x]
    sc = SparkContext()
    hiveCtx = HiveContext(sc)
    post_partition_n = 1000

    for uuid_idx in uuid_list:
        # ns_ip, demand, asnum ns_asnum, ns_country, ns_continent, ns_lat, ns_lon, ns_mpgid, mpgload
        nsjoin_query = """ select ns_ip, demand, asnum ns_asnum, country_code ns_country, continent ns_continent, round(latitude,3) ns_lat, round(longitude,3) ns_lon, mpgid ns_mpgid, mpgload from mapper.nsjoin where day={} and mpd_uuid='{}' and longitude is not NULL and latitude is not NULL and demand > 1""".format(day_idx,
                                                                                                                                                                                                                                                                                                                            uuid_idx)

        # mpgid, mrid, mpg_type, region, link, min_s, max_s, min_r, max_r, ping, local, cont_fb, mpd_dftime, ecor, continent, country, latitude, longitude, prp
        mapmon_query = """ select mpgid, mrid, mpg_type, region, link, min_s, max_s, min_r, max_r, ping, local, cont_fb, mpd_dftime, ecor, continent, country, latitude, longitude, prp from mrqos.mapmon_sum where day={} and mpd_uuid='{}' and longitude is not NULL and latitude is not NULL""".format(day_idx,
                                                                                                                                                                                                                                                                                                          uuid_idx)
        logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))

        nsjoin = hiveCtx.sql(nsjoin_query)
        nsjoin_rows = nsjoin.repartition(post_partition_n).cache()
        data = hiveCtx.sql(mapmon_query)
        data_rows = data.repartition(post_partition_n).cache()

        col = ['mpgid', 'mrid', 'mpg_type', 'region', 'link', 'min_s', 'max_s', 'min_r', 'max_r',
               'ping', 'local', 'cont_fb', 'mpd_dftime', 'ecor', 'continent', 'country', 'latitude', 'longitude', 'prp',
               'ns_ip', 'demand', 'ns_asnum', 'ns_country', 'ns_continent', 'ns_lat', 'ns_lon', 'mpgload']

        cols_appended = ['nsip', 'mrid', 'ns_demand', 'ns_asnum', 'ns_country', 'ns_continent', 'ns_lat', 'ns_lon',
                         'mpgid', 'mpg_type', 'mpg_load', 'regions', 'region_links', 'dftime_ratio', 'ecors',
                         'list_min_s', 'list_max_s', 'list_min_r', 'list_max_r',
                         'region_lats', 'region_lons', 'min_s', 'max_s', 'min_r', 'max_r', 'ping_ratio', 'local_ratio',
                         'cont_fb_ratio', 'in_cont_ratio', 'in_country_ratio', 'private_ratio', 'avg_distance',
                         'num_region_mapped', 'mapping_entropy', 'sum_dftime']

        df = nsjoin_rows.join(data_rows, data_rows.mpgid == nsjoin_rows.ns_mpgid, 'inner')[col].cache()
        # the global maximum of mpd_dftime is the normalization constant used below
        row1 = data_rows.agg(F.max(data_rows.mpd_dftime)).collect()[0]
        max_dftime = row1[0]

        df2 = df.map(lambda x: x + Row(geodesic_distance_weighted(x.ns_lat,
                                                                  x.ns_lon,
                                                                  x.latitude,
                                                                  x.longitude,
                                                                  x.mpd_dftime)))\
                .map(lambda x: ((   x[19], # nsip
                                    x[20], # demand
                                    x[21], # ns_asnum
                                    x[22], # ns_country
                                    x[23], # ns_continent
                                    round(x[24], 3), # ns_lat & ns_lon
                                    round(x[25], 3),
                                    x[0], # mpgid
                                    x[1], # mrid
                                    x[2], # mpg type
                                    x[26], # mpg load
                                    ),
                               [   [int(x[3])], # region
                                   [str(int(x[3])) + "_" + str(int(x[4]))], # region_link
                                   x[5]/max_dftime, # min_s
                                   x[6]/max_dftime, # max_s
                                   x[7]/max_dftime, # min_r
                                   x[8]/max_dftime, # max_r
                                   x[9]/max_dftime, # ping ratio
                                   x[10]/max_dftime, # local ratio
                                   x[11]/max_dftime, # cont_fb ratio
                                   [round(x[12]/max_dftime, 3)], # mpd_dftime/max_dftime (time ratio)
                                   [int(x[13])], # ecor
                                   x[12]/max_dftime * [0, 1][x[14] == x[23]], # mapping in-continent ratio
                                   x[12]/max_dftime * [0, 1][x[15] == x[22]], # mapping in-country ratio
                                   [round(x[16], 3)], # lat
                                   [round(x[17], 3)], # lon
                                   x[18]/max_dftime, # prp
                                   x[27]/max_dftime, # w_distance
                                    x[12], # mpd_dftime (summed into covered dftime)
                                   [round(x[5]/x[12], 2)], # min_s list
                                   [round(x[6]/x[12], 2)], # max_s list
                                   [round(x[7]/x[12], 2)], # min_r list
                                   [round(x[8]/x[12], 2)], # max_r list
                               ]))\
                .reduceByKey(lambda a, b: [x+y for x, y in zip(a, b)])\
                .map(lambda x: [x[0][0], # nsip
                                x[0][8], # mrid
                                x[0][1], # demand
                                x[0][2], # ns_asnum
                                x[0][3], # ns_country
                                x[0][4], # ns_continent
                                x[0][5], # ns_lat
                                x[0][6], # ns_lon
                                x[0][7], # mpgid
                                x[0][9], # mpg type
                                x[0][10], # mpg load
                                x[1][0], # list of region
                                x[1][1], # list of region_link
                                [round(100 * float(y), 2) for y in x[1][9]], # list of covered_record ratio
                                x[1][10], # list of ecor
                                x[1][13], # list of region lat
                                x[1][14], # list of region lon
                                round(x[1][2] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # min_s
                                round(x[1][3] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # max_s
                                round(x[1][4] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # min_r
                                round(x[1][5] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1, # max_r
                                round(100 * x[1][6] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # ping ratio
                                round(100 * x[1][7] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # local ratio
                                round(100 * x[1][8] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # cont_fb ratio
                                round(100 * x[1][11] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # mapping in-continent ratio
                                round(100 * x[1][12] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # mapping in-country ratio
                                round(100 * x[1][15] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # private ratio
                                round(x[1][16] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1, # w_distance
                                round(x[1][17], 3), # summation of covered dftime
                                x[1][18], # list of min_s
                                x[1][19], # list of max_s
                                x[1][20], # list of min_r
                                x[1][21], # list of max_r
                                len(x[1][9]), # number of different regions mapped
                                round(computeEntropyPMF(x[1][9]), 6), # entropy of the region assignments
                                ])\
                .map(lambda x: x + [[i[0] for i in sorted(enumerate([float(y) for y in x[13]]), key=lambda z:z[1], reverse=True)]])\
                .map(lambda x: x[:11] + [':'.join([str(x[11][i]) for i in x[35]]), # list of region
                                         ':'.join([str(x[12][i]) for i in x[35]]), # list of region_link
                                         ':'.join([str(x[13][i]) for i in x[35]]), # list of covered_record ratio
                                         ':'.join([str(x[14][i]) for i in x[35]]), # list of ecor
                                         ':'.join([str(x[29][i]) for i in x[35]]), # list of min_s
                                         ':'.join([str(x[30][i]) for i in x[35]]), # list of max_s
                                         ':'.join([str(x[31][i]) for i in x[35]]), # list of min_r
                                         ':'.join([str(x[32][i]) for i in x[35]]), # list of max_r
                                         ':'.join([str(x[15][i]) for i in x[35]]), # list of region lat
                                         ':'.join([str(x[16][i]) for i in x[35]]), # list of region lon
                                         ] + x[17:28] + x[33:35] + [x[28]])\
                .toDF(cols_appended).cache()

        df_all = df2.map(lambda x: toCSVLine(x))
        logger.info('writing into HDFS')
        df_all.saveAsTextFile(
            '/ghostcache/hadoop/data/MRQOS/mrqos_mapmon_stats/datestamp={}/uuid={}'
            .format(day_idx, uuid_idx))
        logger.info('updating Hive table: mrqos_mapmon_stats')
        beeline.add_partitions(
            "mrqos.mrqos_mapmon_stats",
            "datestamp='{}',uuid='{}'".format(day_idx, uuid_idx))
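Two helpers used in the pipeline above, geodesic_distance_weighted and
computeEntropyPMF, are defined elsewhere. Plausible sketches (assumptions
matching the call sites, not the original code):

import math

# Hypothetical sketch: Shannon entropy of a (possibly unnormalized) PMF,
# matching the call computeEntropyPMF(x[1][9]) on the dftime-ratio list.
def computeEntropyPMF(weights):
    total = float(sum(weights))
    probs = [w / total for w in weights if w > 0]
    return -sum(p * math.log(p) for p in probs)

# Hypothetical sketch: haversine great-circle distance (km) scaled by a
# weight, matching the call geodesic_distance_weighted(ns_lat, ns_lon,
# latitude, longitude, mpd_dftime) above.
def geodesic_distance_weighted(lat1, lon1, lat2, lon2, weight):
    earth_radius_km = 6371.0
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dp = math.radians(lat2 - lat1)
    dl = math.radians(lon2 - lon1)
    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
    return 2 * earth_radius_km * math.asin(math.sqrt(a)) * weight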
def main():
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'hive_table_cleanup.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    # ##############################

    ts = int(time.time())
    ts_timeout = ts - config.mrqos_table_delete * 24 * 3 # time out after 3 days (24*3 hours)

    date_timeout = time.strftime('%Y%m%d', time.gmtime(float(ts_timeout)))
    # hourstamp = time.strftime('%H', time.gmtime(float(ts)))

    list_to_clean = sorted(list(set([x.split('/')[0]
                                     for x in beeline.show_partitions('mrqos.mrqos_region').split('\n')])))
    list_to_clean = [x for x in list_to_clean if ('=' in x and x.split('=')[1] < date_timeout)]

    logger.info('handling table: mrqos_region')
    try:
        logger.info('removing the data in HDFS')
        # remove the hdfs folder
        for item in list_to_clean:
            hdfsutil.rm(os.path.join(config.hdfs_table,
                                     'mrqos_region',
                                     '%s' % item),
                        r=True)

        # alter the hive table: mrqos_region
        try:
            logger.info('drop partitions, condition: datestamp<%s' % str(date_timeout))
            beeline.drop_partitions(tablename='mrqos.mrqos_region',
                                    condition='datestamp<%s' % str(date_timeout))
        except sp.CalledProcessError as e:
            logger.error('drop partition failed')
            logger.error('error: %s' % e.message)

    except sp.CalledProcessError as e:
        logger.error('removing data from HDFS failed')
        logger.error('error: %s' % e.message)

    # ##############################
    # target table: maprule_info, mcm_machines
    # ##############################

    query_item = ['maprule_info', 'mcm_machines']

    for scan in query_item:
        logger.info('handling table: %s' % scan)
        list_to_clean = sorted(list(set([x.split('/')[0]
                                         for x in beeline.show_partitions('mrqos.%s' % scan).split('\n')])))
        list_to_clean = [x for x in list_to_clean if ('=' in x and int(x.split('=')[1]) < ts_timeout)]

        try:
            logger.info('removing the data in HDFS')
            # remove the hdfs folder
            for item in list_to_clean:
                hdfsutil.rm(os.path.join(config.hdfs_table,
                                         '%s' % scan,
                                         '%s' % item),
                            r=True)

            # alter the hive table for the current target table
            try:
                logger.info('drop partitions, condition: ts<%s' % str(ts_timeout))
                beeline.drop_partitions(tablename='mrqos.%s' % scan,
                                        condition='ts<%s' % str(ts_timeout))
            except sp.CalledProcessError as e:
                logger.error('drop partition failed')
                logger.error('error: %s' % e.message)

        except sp.CalledProcessError as e:
            logger.error('removing data from HDFS failed')
            logger.error('error: %s' % e.message)
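The beeline wrapper calls used throughout (show_partitions, drop_partitions,
add_partitions, bln_e) are not shown in these examples. A minimal sketch of
drop_partitions, assuming bln_e shells out to beeline -e and raises
sp.CalledProcessError on a nonzero exit status:

# Hypothetical sketch of beeline.drop_partitions as called above.
def drop_partitions(tablename, condition):
    # Hive accepts comparison predicates on partition columns, e.g.
    #   ALTER TABLE mrqos.mrqos_region DROP PARTITION (datestamp < '20160101')
    bln_e('ALTER TABLE %s DROP IF EXISTS PARTITION (%s);' % (tablename, condition))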