Code example #1
def main():
    """ get the date for the past day (yesterday). """
    timenow = int(time.time())
    datenow = str(datetime.date.today()-datetime.timedelta(1))
    datenow = datenow[0:4]+datenow[5:7]+datenow[8:10]

    print "###################"
    print "# Start processing the data back in " + datenow + " (yesterday)"
    print "# starting processing time is " + str(timenow)
    print "###################"

    ts = calendar.timegm(time.gmtime())
    ts_last_hour = ts-3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))

    # check if the summary has been performed for this particular day (yesterday)
    print "    ****  checking day = %s." % (datestamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_day % (datestamp), '000000_0.deflate')):
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_day.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, datestamp, datestamp)
        f.close()
        print "    ****  perform beeline for hourly summary for day = %s, hour = %s." %(datestamp, hourstamp)
        try:
            beeline.bln_e(strcmd_s)
        except:
            # delete the folder if summarization failed.
            print "    ****  summarization failed, removed hdfs folder."
            hdfsutil.rm(config.hdfs_qos_rg_day % (datestamp), r=True)
    else:
        print " file exists."
Code example #2
File: query_jobs.py  Project: YuTengChang/akam_mrqos
def mrqos_table_cleanup():
    """ when called, this function will delete all partitions
        the clnspp table as long as it is older than the threshold """

    # get the lowest partition by checking the HDFS folders
    score_partitions = hdfsutil.ls(config.hdfs_table_score)
    str_parts_list = [i.split('=', 1)[1] for i in score_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold, if not, drop in hive table and remove from hdfs
    timenow = int(time.time())
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']
    for item in mtype:
        exec('this_partitions = hdfsutil.ls(config.hdfs_table_%s)' % item)
        str_parts_list = [i.split('=', 1)[1] for i in this_partitions]
        str_parts_list_int = map(int, str_parts_list)
        print "      ##  for table: %s" % item
        print "      ##  ",
        print str_parts_list_int

        for partition in str_parts_list_int:
            if partition < timenow - config.mrqos_table_delete:
                try:
                    print "      ##  handling table: %s with ts=%s" % (item, str(partition))
                    # drop partitions (ok even if partition does not exist)
                    hiveql_str = 'use mrqos; alter table ' + item + ' drop if exists partition(ts=%s)' % str(partition)
                    beeline.bln_e(hiveql_str)
                    # remove data from HDFS (ok even if folder in hdfs does not exist)
                    hdfs_d = os.path.join(config.hdfs_table, item, 'ts=%s' % partition)
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed in hive table clean up in table: %s." % item
                    print e.message
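The exec() call above works under Python 2 but hides the variable it assigns; the same per-table partition scan can be written with getattr() on the config module. A sketch reusing the mtype, config and hdfsutil names from the snippet above:

for item in mtype:
    # resolve config.hdfs_table_<item> without exec()
    this_partitions = hdfsutil.ls(getattr(config, 'hdfs_table_%s' % item))
    str_parts_list_int = map(int, [i.split('=', 1)[1] for i in this_partitions])
    print "      ##  for table: %s" % item
    print "      ##  ", str_parts_list_int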
Code example #3
def main():
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'io_ratio_window_summarize.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    max_retrial = 10
    ts = int(time.time())
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts)))
    window_length = config.mrqos_join_delete + 1*24*60*60
    datestamp_14d_ago = time.strftime('%Y%m%d', time.gmtime(float(ts-window_length)))
    logger.info('## Summarize IORATIO table started at %s.' % str(ts))

    logger.info("direct summarize and insert into mrqos_sum_io.")
    # direct join and insert in hive
    f = open('/home/testgrp/MRQOS/mrqos_hive_query/MRQOS_table_summarize_ioratio.hive', 'r')
    strcmd = f.read()
    strcmd_s = strcmd % (str(datestamp), str(datestamp_14d_ago), str(datestamp))
    f.close()
    logger.info("  ****  perform beeline for ioratio join.")
    retrial = 0
    while retrial < max_retrial:
        try:
            tic = time.time()
            beeline.bln_e(strcmd_s)
            logger.info('perform beeline for ioratio for 2W timeframe succeeded with time cost = %s second' % str(time.time()-tic))
            break
        except sp.CalledProcessError as e:
            retrial += 1
            logger.error('perform beeline for ioratio for 2W timeframe failed.')
            logger.error('error message: %s', e.message)
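The two-week window above is derived from config.mrqos_join_delete plus one extra day; the real value of that setting is not shown on this page, so the figures below assume 13 days purely to illustrate the datestamp arithmetic.

import time

mrqos_join_delete = 13 * 24 * 60 * 60               # assumed value, not the real config
window_length = mrqos_join_delete + 1 * 24 * 60 * 60
ts = 1458100800                                      # 2016-03-16 04:00:00 GMT, illustrative
print time.strftime('%Y%m%d', time.gmtime(float(ts)))                   # '20160316'
print time.strftime('%Y%m%d', time.gmtime(float(ts - window_length)))   # '20160302'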
Code example #4
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts-3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    hour_list = [str("%02d" % x) for x in range(24)]
    region_summary_retrial_max = 10

    # check if the summary has been performed on this particular hour (last hour)
    print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print "    ****  perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print "    ****  copy to local failed!"
                break
            except:
                # delete the folder if summarization failed.
                print "    ****  summarization failed upto #retrials="+str(count_retrial)
                hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1

    else:
        print " file exists."

    # check if the summary has been performed since the beginning of the day, last check on day X is X+1/0:30:00
    for hour in hour_list:
        if hour < hourstamp:
            print "    ****  checking day = %s, hour = %s." % (datestamp, hour),
            if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hour), '000000_0.deflate')):
                f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
                strcmd = f.read()
                strcmd_s = strcmd % (datestamp, hour, datestamp, hour, datestamp, hour)
                f.close()
                print "    ****  perform beeline for hourly summary for day = %s, hour = %s." %(datestamp, hour)
                try:
                    beeline.bln_e(strcmd_s)
                except:
                    # delete the folder if summarization failed.
                    print "    ****  summarization failed, removed hdfs folder."
                    hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hour), r=True)
            else:
                print " file exists."
Code example #5
def main():

    datestamp = '20160316'
    hourstamp = '04'

    # test0 the original order of join
    f = open(
        os.path.join(config.mrqos_hive_query,
                     'test0_mrqos_region_view_hour.hive'), 'r')
    strcmd = f.read()
    strcmd_s1 = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
    f.close()

    # test the reverse order of join
    f = open(
        os.path.join(config.mrqos_hive_query,
                     'test_mrqos_region_view_hour.hive'), 'r')
    strcmd = f.read()
    strcmd_s2 = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
    f.close()

    fail_count = [0] * 2
    time_count = [0] * 2

    iter = 10

    for item in range(iter):
        tic = time.time()
        fail0 = False
        fail1 = False
        try:
            beeline.bln_e(strcmd_s1)
            span1 = time.time() - tic
            time_count[0] += span1
        except:
            span1 = time.time() - tic
            fail_count[0] += 1
            fail0 = True

        tic = time.time()
        try:
            beeline.bln_e(strcmd_s2)
            span2 = time.time() - tic
            time_count[1] += span2
        except:
            span2 = time.time() - tic
            fail_count[1] += 1
            fail1 = True

        print "test0 takes %s (%s) and test1 takes %s (%s)" % (
            str(span1), "failed" if fail0 else "ok", str(span2),
            "failed" if fail1 else "ok")

    print "<<< overall result >>>"
    print "test0 takes %s and test1 takes %s" % (str(
        time_count[0] /
        (iter - fail_count[0])), str(time_count[1] / (iter - fail_count[1])))
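One caveat in the benchmark above: if every trial of a variant fails, iter - fail_count[i] is zero and the final averages raise ZeroDivisionError. A guarded version of the summary, reusing the names from the snippet (illustrative only):

for i, name in enumerate(['test0', 'test1']):
    # 'iter', 'fail_count' and 'time_count' come from the snippet above
    ok_runs = iter - fail_count[i]
    avg = time_count[i] / ok_runs if ok_runs else float('nan')
    print "%s: %d successful runs, average %s seconds" % (name, ok_runs, str(avg))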
Code example #6
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts-3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    hour_list = [str("%02d" % x) for x in range(24)]
    hour_list = [x for x in hour_list if x <= hourstamp]
    region_summary_retrial_max = 10

    # check if the summary has been performed on this particular hour (last hour)
    folders_day = '/'.join(str(config.hdfs_qos_rg_view_hour % (datestamp, '00')).split('/')[0:-1])

    # check if the summary folder for "this day" (datestamp) has been created or not, if not, create one
    if hdfsutil.test_dic(folders_day):
        hdfsutil.mkdir(folders_day)

    folders_in = [folders_day+'/hour=%s' % x for x in hour_list]
    folders_out = hdfsutil.ls(folders_day)

    folders_missing = [x for x in folders_in if x not in folders_out]
    folders_missing.sort(reverse=True)

    for item in folders_missing:
        hourstamp = item[-2:]
        print "    ****  missing data for day = %s, hour = %s." % (datestamp, hourstamp),
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print "    ****  perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print "    ****  copy to local failed!"
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ****  summarization failed upto #retrials="+str(count_retrial)
                print "    ****  ",
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
Code example #7
def main():

    datestamp = '20160316'
    hourstamp = '04'

    # test0 the original order of join
    f = open(os.path.join(config.mrqos_hive_query, 'test0_mrqos_region_view_hour.hive'), 'r')
    strcmd = f.read()
    strcmd_s1 = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
    f.close()

    # test the reverse order of join
    f = open(os.path.join(config.mrqos_hive_query, 'test_mrqos_region_view_hour.hive'), 'r')
    strcmd = f.read()
    strcmd_s2 = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
    f.close()

    fail_count = [0] * 2
    time_count = [0] * 2

    iter = 10

    for item in range(iter):
        tic = time.time()
        fail0 = False
        fail1 = False
        try:
            beeline.bln_e(strcmd_s1)
            span1 = time.time()-tic
            time_count[0] += span1
        except:
            span1 = time.time()-tic
            fail_count[0] += 1
            fail0 = True

        tic = time.time()
        try:
            beeline.bln_e(strcmd_s2)
            span2 = time.time()-tic
            time_count[1] += span2
        except:
            span2 = time.time()-tic
            fail_count[1] += 1
            fail1 = True

        print "test0 takes %s (%s) and test1 takes %s (%s)" % (str(span1),
                                                               "failed" if fail0 else "ok",
                                                               str(span2),
                                                               "failed" if fail1 else "ok")

    print "<<< overall result >>>"
    print "test0 takes %s and test1 takes %s" % (str(time_count[0]/(iter-fail_count[0])),
                                                 str(time_count[1]/(iter-fail_count[1])))
Code example #8
def main():
    # initialize the logger
    logging.basicConfig(
        filename=os.path.join('/home/testgrp/logs/', 'mapmon_summarize.log'),
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    timenow = int(time.time())
    datenow = str(datetime.date.today() - datetime.timedelta(1))
    date_idx = datenow[0:4] + datenow[5:7] + datenow[8:10]

    # get the latest barebone day_idx
    bb_day_idx = beeline.get_last_partitions('mapper.barebones').split('=')[1]
    logger.info("barebone index: day={}".format(bb_day_idx))

    # get the latest mpd yesterday
    uuid_list = [
        x.split('=')[-1] for x in hdfsutil.ls(
            os.path.join(os.path.dirname(config.hdfs_table), 'mapper',
                         'mapmon', 'day={}'.format(date_idx)))
    ]
    for uuid_idx in uuid_list:
        logger.info("dealing with day={}, uuid={}".format(date_idx, uuid_idx))
        file_location = os.path.join(config.hdfs_table, 'mapmon_sum',
                                     'day={}'.format(date_idx),
                                     'mpd_uuid={}'.format(uuid_idx))
        if hdfsutil.test_dic(file_location):
            logger.info('creating folder: {}'.format(file_location))
            hdfsutil.mkdir(file_location)

        if hdfsutil.test_file(os.path.join(file_location, '000000_0.deflate')):
            f = open(
                os.path.join(config.mrqos_hive_query, 'mapmon_summarize.hive'),
                'r')
            strcmd = f.read()
            strcmd_s = strcmd % (date_idx, uuid_idx, bb_day_idx, date_idx,
                                 uuid_idx, date_idx, uuid_idx)
            f.close()
            try:
                beeline.bln_e(strcmd_s)
            except:
                # delete the folder if summarization failed.
                logger.warn("summarization failed, removing hdfs folder.")
                hdfsutil.rm(file_location, r=True)
        else:
            logger.info(" file exists.")
Code example #9
def cleanup_mrqos_region_related_tables(datestamp, hour):
    tables = ['mrqos_region_hour', 'case_view_hour', 'region_view_hour']
    for table_item in tables:
        try:
            # drop partitions (ok even if partition does not exist)
            hiveql_str = 'use mrqos; alter table %s drop if exists partition(datestamp=%s, hour=%s)' % (table_item,
                                                                                                        str(datestamp),
                                                                                                        str(hour))
            beeline.bln_e(hiveql_str)
            # remove data from HDFS (ok even if folder in hdfs does not exist)
            hdfs_d = os.path.join(config.hdfs_table, table_item, 'datestamp=%s' % str(datestamp), 'hour=%s' % str(hour))
            hdfsutil.rm(hdfs_d, r=True)
        except sp.CalledProcessError:
            print ">> failed in hive table clean up in table: %s for partition datestamp=%s, hour=%s." % (table_item,
                                                                                                          str(datestamp),
                                                                                                          str(hour))
            pass
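Example invocation, with datestamp/hour values in the same format the other jobs use (values illustrative): this drops the datestamp=20160316/hour=04 partitions of mrqos_region_hour, case_view_hour and region_view_hour and removes the matching HDFS folders.

cleanup_mrqos_region_related_tables('20160316', '04')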
Code example #10
def main():
    # logging set-up
    logging.basicConfig(
        filename=os.path.join(config.mrqos_logging,
                              'io_ratio_window_summarize.log'),
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    max_retrial = 10
    ts = int(time.time())
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts)))
    window_length = config.mrqos_join_delete + 1 * 24 * 60 * 60
    datestamp_14d_ago = time.strftime('%Y%m%d',
                                      time.gmtime(float(ts - window_length)))
    logger.info('## Summarize IORATIO table started at %s.' % str(ts))

    logger.info("direct summarize and insert into mrqos_sum_io.")
    # direct join and insert in hive
    f = open(
        '/home/testgrp/MRQOS/mrqos_hive_query/MRQOS_table_summarize_ioratio.hive',
        'r')
    strcmd = f.read()
    strcmd_s = strcmd % (str(datestamp), str(datestamp_14d_ago),
                         str(datestamp))
    f.close()
    logger.info("  ****  perform beeline for ioratio join.")
    retrial = 0
    while retrial < max_retrial:
        try:
            tic = time.time()
            beeline.bln_e(strcmd_s)
            logger.info(
                'perform beeline for ioratio for 2W timeframe succeeded with time cost = %s second'
                % str(time.time() - tic))
            break
        except sp.CalledProcessError as e:
            retrial += 1
            logger.error(
                'perform beeline for ioratio for 2W timeframe failed.')
            logger.error('error message: %s', e.message)
Code example #11
def main():
    # initialize the logger
    logging.basicConfig(filename=os.path.join('/home/testgrp/logs/', 'mapmon_summarize.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    timenow = int(time.time())
    datenow = str(datetime.date.today()-datetime.timedelta(1))
    date_idx = datenow[0:4]+datenow[5:7]+datenow[8:10]

    # get the latest barebone day_idx
    bb_day_idx = beeline.get_last_partitions('mapper.barebones').split('=')[1]
    logger.info("barebone index: day={}".format(bb_day_idx))

    # get the latest mpd yesterday
    uuid_list = [x.split('=')[-1] for x in hdfsutil.ls(os.path.join(os.path.dirname(config.hdfs_table),'mapper','mapmon','day={}'.format(date_idx)))]
    for uuid_idx in uuid_list:
        logger.info("dealing with day={}, uuid={}".format(date_idx, uuid_idx))
        file_location = os.path.join(config.hdfs_table,
                                           'mapmon_sum',
                                           'day={}'.format(date_idx),
                                           'mpd_uuid={}'.format(uuid_idx))
        if hdfsutil.test_dic(file_location):
            logger.info('creating folder: {}'.format(file_location))
            hdfsutil.mkdir(file_location)


        if hdfsutil.test_file(os.path.join(file_location, '000000_0.deflate')):
            f = open(os.path.join(config.mrqos_hive_query, 'mapmon_summarize.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (date_idx, uuid_idx, bb_day_idx,
                                 date_idx, uuid_idx,
                                 date_idx, uuid_idx)
            f.close()
            try:
                beeline.bln_e(strcmd_s)
            except:
                # delete the folder if summarization failed.
                logger.warn("summarization failed, removing hdfs folder.")
                hdfsutil.rm(file_location, r=True)
        else:
            logger.info(" file exists.")
Code example #12
def main():
    # #### MRQOS region LOCAL PART ####
    # ignore the local timestamp, use what files are tagged
    timenow = time.time()
    print "###################"
    print "# Performing the hourly mrqos_region insert"
    print "# starting processing time is " + str(timenow)
    print "###################"

    list_qos_files = glob.glob( os.path.join(config.mrqos_data,
                                            'qos_region.*.tmp') ) # glob get the full path
    for qos_file in list_qos_files:
        infoitem = qos_file.rsplit('.',2)
        ts = infoitem[-2]
        datestamp = time.strftime('%Y%m%d', time.localtime(float(ts)))
        # do we need hourly partition or not?
        hourstamp = time.strftime('%H', time.localtime(float(ts)))

        print '    file = ' + qos_file
        print '    timestamp = %s;' % ( ts )

        # put the file to HDFS folder and remove from Local
        try:
            print '    upload to HDFS'
            hdfs_rg_destination = config.hdfs_qos_rg_info % ( datestamp, hourstamp, ts )
            hdfs.mkdir( hdfs_rg_destination )
            hdfs.put( qos_file, hdfs_rg_destination )

            print '    adding partition'
            hiveql_str = config.add_rg_partition % ( datestamp, hourstamp, ts )
            beeline.bln_e(hiveql_str)

            print '    remove local file: ' + qos_file
            os.remove(qos_file)
        except sp.CalledProcessError as e:
            print e.message
            print 'MRQOS region(RG) information update failed for timestamp=%s' % ( ts )
            if 'File exists' in e.message:
                print '    remove local file: ' + qos_file
                os.remove(qos_file)
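The rsplit above implies the local files are named 'qos_region.<unix_ts>.tmp', so the second-to-last dot-separated token is the timestamp; a quick illustration with a made-up filename:

infoitem = 'qos_region.1458100800.tmp'.rsplit('.', 2)   # illustrative filename
print infoitem       # ['qos_region', '1458100800', 'tmp']
print infoitem[-2]   # '1458100800'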
Code example #13
File: query_jobs.py  Project: YuTengChang/akam_mrqos
def upload_to_hive(listname, hdfs_d, ts, tablename):
    """ this function will create a partition directory in hdfs with the requisite timestamp. It will
    then add the partition to the table cl_ns_pp with the appropriate timestamp """

    # hdfs_d = config.hdfsclnspp % (ts)
    # create the partition
    try:
        sp.check_call(['hadoop', 'fs', '-mkdir', hdfs_d])
    # upload the data
    except sp.CalledProcessError:
        raise HadoopDirectoryCreateError
    try:
        sp.check_call(['hadoop', 'fs', '-put', listname, hdfs_d])
    except sp.CalledProcessError:
        raise HadoopDataUploadError

    # add the partition
    try:
        hiveql_str = 'use mrqos; alter table ' + tablename + ' add partition(ts=%s);' % (ts)
        beeline.bln_e(hiveql_str)
        # sp.check_call(['hive', '-e', hiveql_str])
    except sp.CalledProcessError:
        raise HiveCreatePartitionError
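A typical call site, mirroring how the main() job in example #16 below registers one 10-minute measurement dump; the paths are resolved through the same config module assumed above.

import os, time

timenow = int(time.time())
listname = os.path.join(config.mrqos_data, 'score.tmp')
hdfs_d = os.path.join(config.hdfs_table, 'score', 'ts=%s' % str(timenow))
upload_to_hive(listname, hdfs_d, str(timenow), 'score')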
Code example #14
File: query_jobs.py  Project: YuTengChang/akam_mrqos
def upload_to_hive(listname, hdfs_d, ts, tablename):
    """ this function will create a partition directory in hdfs with the requisite timestamp. It will
    then add the partition to the table cl_ns_pp with the appropriate timestamp """

    # hdfs_d = config.hdfsclnspp % (ts)
    # create the partition
    try:
        sp.check_call(['hadoop', 'fs', '-mkdir', hdfs_d])
    # upload the data
    except sp.CalledProcessError:
        raise HadoopDirectoryCreateError
    try:
        sp.check_call(['hadoop', 'fs', '-put', listname, hdfs_d])
    except sp.CalledProcessError:
        raise HadoopDataUploadError

    # add the partition
    try:
        hiveql_str = 'use mrqos; alter table ' + tablename + ' add partition(ts=%s);' % (
            ts)
        beeline.bln_e(hiveql_str)
        # sp.check_call(['hive', '-e', hiveql_str])
    except sp.CalledProcessError:
        raise HiveCreatePartitionError
Code example #15
File: query_jobs.py  Project: YuTengChang/akam_mrqos
def mrqos_table_cleanup():
    """ when called, this function will delete all partitions
        the clnspp table as long as it is older than the threshold """

    # get the lowest partition by checking the HDFS folders
    score_partitions = hdfsutil.ls(config.hdfs_table_score)
    str_parts_list = [i.split('=', 1)[1] for i in score_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold, if not, drop in hive table and remove from hdfs
    timenow = int(time.time())
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']
    for item in mtype:
        exec('this_partitions = hdfsutil.ls(config.hdfs_table_%s)' % item)
        str_parts_list = [i.split('=', 1)[1] for i in this_partitions]
        str_parts_list_int = map(int, str_parts_list)
        print "      ##  for table: %s" % item
        print "      ##  ",
        print str_parts_list_int

        for partition in str_parts_list_int:
            if partition < timenow - config.mrqos_table_delete:
                try:
                    print "      ##  handling table: %s with ts=%s" % (
                        item, str(partition))
                    # drop partitions (ok even if partition does not exist)
                    hiveql_str = 'use mrqos; alter table ' + item + ' drop if exists partition(ts=%s)' % str(
                        partition)
                    beeline.bln_e(hiveql_str)
                    # remove data from HDFS (ok even if folder in hdfs does not exist)
                    hdfs_d = os.path.join(config.hdfs_table, item,
                                          'ts=%s' % partition)
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed in hive table clean up in table: %s." % item
                    print e.message
Code example #16
File: query_jobs.py  Project: YuTengChang/akam_mrqos
def main():
    """  this function will do the query on 5 different measurement and upload
    the data to hdfs accordingly, this also join tables at single time point """

    # different queries (various types)
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']

    sql = """sql2 -q map.mapnoccthree.query.akadns.net --csv "`cat """
    post = """`" | tail -n+3 | awk -F"," 'BEGIN{OFS=","}{$1=""; print $0}' | sed 's/^,//g' > """

    # current time
    timenow = int(time.time())

    print "###################"
    print "Start processing the data back in for 10 minute joins"
    print "starting processing time is " + str(timenow)
    print "###################"

    # fetch the data through query with retrials
    print "    ****  querying mrqos data."
    for item in mtype:
        flag = 0
        count = 0
        dest = os.path.join(config.mrqos_data, item + '.tmp')
        aggs = os.path.join(config.mrqos_query, item + '.qr')

        cmd = sql + aggs + post + dest
        n_retrial = config.query_retrial
        t_timeout = config.query_timeout
        # multiple times with timeout scheme
        while (flag == 0) and (count < n_retrial):
            try:
                with ytt.Timeout(t_timeout):
                    sp.call(cmd, shell=True)
                    flag = 1
            except:
                count += 1
        # if any of the queries was not fetched successfully, abort the whole job
        if count >= n_retrial:
            print ">> data fetch failed in querying table %s" % item
            return

    # provide SCORE table with peak/off-peak attribute
    print "    ****  provide PEAK in score."
    sp.call([config.provide_peak], shell=True)

    # backup the individual query file by copying to backup folder
    print "    ****  backing up queried results."
    if not os.path.exists('/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow)):
        os.makedirs('/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow))
        for item in mtype:
            filesrc = os.path.join(config.mrqos_data, item + '.tmp')
            filedst = '/home/testgrp/MRQOS/mrqos_data/backup/%s/' % str(timenow)
            shutil.copy(filesrc, filedst)

    # upload to hdfs and link to hive tables
    print "    ****  uploading to hdfs and hive."
    try:
        # adding the individual query result to hdfs and add hive partitions
        for item in mtype:
            listname = os.path.join(config.mrqos_data, item + '.tmp')
            hdfs_d = os.path.join(config.hdfs_table, item, 'ts=%s' % str(timenow))
            upload_to_hive(listname, hdfs_d, str(timenow), item)
        shutil.rmtree('/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow))

        # new version of the join tables in hive: direct insert #
        # specify the new joined file in hdfs
        hdfs_file = os.path.join(config.hdfs_table, 'mrqos_join', 'ts=%s' % str(timenow), '000000_0.deflate')
        # specify the local copy of the joined file
        local_file = os.path.join(config.mrqos_data_backup, '000000_0.deflate')
        try:
            print "    ****  direct join and insert into mrqos_join."
            # direct join and insert in hive
            f = open('/home/testgrp/MRQOS/MRQOS_table_join2.hive', 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (str(timenow), str(timenow), str(timenow), str(timenow), str(timenow), str(timenow))
            f.close()
            print "    ****  perform beeline for join."
            beeline.bln_e(strcmd_s)
            # have the local copy of the joined file
            print "    ****  copy the joined file for backup."
            hdfsutil.get(hdfs_file, local_file)
        except sp.CalledProcessError as e:
            print ">> direct join and insert failed, trying to copy the last succeeded one"
            print e.message
            try:
                # upload the last succeeded one from local
                print "    ****  copying backups from local to hdfs"
                hdfsutil.put(local_file, hdfs_file)
                try:
                    # using hive to add partitions to joined query results
                    print "    ****  adding hive partitions"
                    hiveql_str = 'use mrqos; alter table mrqos_join add partition(ts=%s);' % str(timenow)
                    beeline.bln_e(hiveql_str)
                except sp.CalledProcessError as e:
                    print ">> copying from duplicated file for mrqos_join failed in adding partitions"
                    print e.message
                    #raise HiveCreatePartitionError
            except:
                print "copying from duplicated file for mrqos_join failed in uploading to hdfs"
    except:
        print "HDFS upload failed, backup file retains"

    # clear the expired data in mrqos_table
    print "    ****  clean up mrqos individual table."
    mrqos_table_cleanup()
    # clear the expired data in mrqos_join
    print "    ****  clean up mrqos joined table."
    mrqos_join_cleanup()
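mrqos_join_cleanup(), called at the end of this job, is not listed on this page; by analogy with mrqos_table_cleanup() it presumably drops expired ts= partitions of mrqos_join and removes the matching HDFS folders. A hypothetical sketch under that assumption, reusing the same config/hdfsutil/beeline helpers:

def mrqos_join_cleanup():
    """ hypothetical: drop ts= partitions of mrqos_join older than the configured threshold """
    timenow = int(time.time())
    join_dir = os.path.join(config.hdfs_table, 'mrqos_join')
    for entry in hdfsutil.ls(join_dir):
        ts = int(entry.split('=', 1)[1])
        if ts < timenow - config.mrqos_join_delete:
            beeline.bln_e('use mrqos; alter table mrqos_join drop if exists partition(ts=%s)' % ts)
            hdfsutil.rm(os.path.join(join_dir, 'ts=%s' % ts), r=True)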
Code example #17
def main():
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'io_ratio_join.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting

    ts = int(time.time())
    logger.info('########### ts=%s ###########' % str(ts))
    #datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts)))
    #hourstamp = time.strftime('%H', time.gmtime(float(ts)))

    # IO-Ratio Join:
    last_mrqos_region_partition = beeline.get_last_partitions('mrqos.mrqos_region')
    [datestamp, hourstamp, ts_region] = [x.split('=')[1] for x in last_mrqos_region_partition.split('/')]
    logger.info('MRQOS mrqos_region partition: datestamp=%s, hour=%s, ts_region=%s' % (datestamp,
                                                                                 hourstamp,
                                                                                 ts_region))

    mapruleinfo_partitions = [x for x in sorted(beeline.show_partitions('mrqos.maprule_info').split('\n'),reverse=True) if '=' in x]
    mapruleinfo_partitions = [x for x in mapruleinfo_partitions if x < 'ts=%s' % ts_region]
    ts_mapruleinfo = mapruleinfo_partitions[0].split('=')[1]
    logger.info('MRQOS maprule_info partition: ts_mapruleinfo=%s' % ts_mapruleinfo)

    region_summary_retrial_max = 10

    # ############################### #
    # The In-Out Ratio hive procedure #
    # ############################### #
    # check if the summary has been performed on this particular hour (last hour)
    # print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_table,
                                       'mrqos_ioratio',
                                       'datestamp=%s' % datestamp,
                                       'hour=%s' % hourstamp,
                                       'ts=%s' % ts_region,
                                       '000000_0.deflate')):
        logger.info(' Joined file does not exist.')
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_ioratio.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, ts_region,
                             datestamp, hourstamp, ts_region,
                             ts_mapruleinfo)
        print strcmd_s
        f.close()
        # strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        # query_result_file = os.path.join(config.mrqos_query_result,'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))

        print " BLN for hourly summary: day = %s, hour = %s. " %(datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            tic = time.time()
            try:
                beeline.bln_e(strcmd_s)
                logger.info('    ******  success with time cost = %s.' % str(time.time()-tic))
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                logger.error('    ******  failed with time cost = %s upto # retrials=%s' % (str(time.time()-tic), str(count_retrial)))
                logger.error('error %s' % e.message)
                hdfsutil.rm(os.path.join(config.hdfs_table,
                                         'mrqos_ioratio',
                                         'datestamp=%s' % datestamp,
                                         'hour=%s' % hourstamp,
                                         'ts=%s' % ts_region), r=True)
                count_retrial += 1
    else:
        logger.info(' Joined file exists.')
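For reference, the partition string returned by get_last_partitions('mrqos.mrqos_region') has the shape unpacked near the top of this job; a quick illustration with a made-up value:

part = 'datestamp=20160316/hour=04/ts=1458100800'   # illustrative value
datestamp, hourstamp, ts_region = [x.split('=')[1] for x in part.split('/')]
print datestamp, hourstamp, ts_region               # 20160316 04 1458100800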
Code example #18
def main(argv):
    """ get the date and hour for the specified day and hour. Clean(drop) and rebuild the table partition. """
    try:
        opts, args = getopt.getopt(argv,"qd:h:",["datestamp=","hour="])
    except getopt.GetoptError:
        print 'region_summary_hour_repair.py -d <datestamp> -h <hour>'
        sys.exit(2)

    hour =''
    datestamp = ''

    for opt, arg in opts:
        if opt == '-q':
            print 'region_summary_hour_repair.py -d <datestamp> -h <hour>'
            sys.exit()
        elif opt in ("-d", "--datestamp"):
            datestamp = arg
        elif opt in ("-h", "--hour"):
            hour = arg

    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the repair of the  mrqos_region summary"
    print "# starting processing time is " + str(ts) + " = " + time.strftime('GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts))
    print "###################"

    if (not datestamp and not hour):
        print 'region_summary_hour_repair.py -d <datestamp> -h <hour>'
        sys.exit(2)

    print 'Fixing datestamp = %s' % datestamp

    if not hour:
        hour_list = [str("%02d" % x) for x in range(24)]
        print 'Fixing hour = %s' % hour_list
    else:
        print 'Fixing hour = %s' % hour

    #ts_last_hour = ts-3600
    #datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    #hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    #hour_list = [str("%02d" % x) for x in range(24)]
    region_summary_retrial_max = 10


    print "    #**** first perform table cleanups: "
    if not hour:
        for hourstamp in hour_list:
            cleanup_mrqos_region_related_tables(datestamp, hourstamp)
    else:
        hourstamp = hour
        cleanup_mrqos_region_related_tables(datestamp, hourstamp)

    print "    #**** rebuild the db / table: "
    if not hour:
        for hourstamp in hour_list:
            # ############################### #
            # The SUMMARY HOUR hive procedure #
            # ############################### #
            print "    ****  summary hour tour:"
            # check if the summary has been performed on this particular hour (last hour)
            print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
            if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp), '000000_0.deflate')):
                print " file not exits,",
                f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_hour.hive'), 'r')
                strcmd = f.read()
                strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
                f.close()
                strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp)
                query_result_file = os.path.join(config.mrqos_query_result,'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))

                print " BLN for hourly summary: day = %s, hour = %s. " %(datestamp, hourstamp)
                count_retrial = 0
                while count_retrial < region_summary_retrial_max:
                    tic = time.time()
                    try:
                        beeline.bln_e(strcmd_s)
                        print "    ******  success with time cost = %s." % str(time.time()-tic)
                        #try:
                        #    beeline.bln_e_output(strcmd_g, query_result_file)
                        #except:
                        #    print "    ****  copy to local failed, retry!"
                        #    beeline.bln_e_output(strcmd_g, query_result_file)
                        break
                    except:
                        # delete the folder if summarization failed.
                        print "    ******  failed with time cost = %s upto # retrials=%s" % (str(time.time()-tic), str(count_retrial))
                        hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp), r=True)
                        count_retrial += 1
            else:
                print " file exists."


            # ############################ #
            # The CASE VIEW hive procedure #
            # ############################ #
            print "    ****  case view tour:"
            # check if the summary has been performed on this particular hour (last hour)
            print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
            if hdfsutil.test_file(os.path.join(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
                print " file not exits,",
                f = open(os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'), 'r')
                strcmd = f.read()
                strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
                f.close()
                strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
                query_result_file = os.path.join(config.mrqos_query_result,'case_view_hour.%s.%s.csv' % (datestamp, hourstamp))
                print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
                count_retrial = 0
                while count_retrial < region_summary_retrial_max:
                    try:
                        tic = time.time()
                        beeline.bln_e(strcmd_s)
                        print "    ******  success with time cost = %s." % str(time.time()-tic)
                        # repair don't care about moving the result to SQLite DB
                        #try:
                        #    beeline.bln_e_output(strcmd_g, query_result_file)
                        #except:
                        #    print "    ****  copy to local failed, retry!"
                        #    beeline.bln_e_output(strcmd_g, query_result_file)
                        break
                    except:
                        # delete the folder if summarization failed.
                        print "    ******  failed with time cost = %s upto #retrials=%s" % (str(time.time()-tic), str(count_retrial))
                        hdfsutil.rm(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), r=True)
                        count_retrial += 1

            else:
                print " file exists."


            # ############################## #
            # The REGION VIEW hive procedure #
            # ############################## #
            print "    ****  region view tour:"
            # check if the summary has been performed on this particular hour (last hour)
            print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
            if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
                print " file not exits,",
                f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
                strcmd = f.read()
                strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
                f.close()
                strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
                query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
                print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
                count_retrial = 0
                while count_retrial < region_summary_retrial_max:
                    try:
                        tic = time.time()
                        beeline.bln_e(strcmd_s)
                        print "    ******  success with time cost = %s." % str(time.time()-tic)
                        # repair don't care about moving the result to SQLite DB
                        #try:
                        #    beeline.bln_e_output(strcmd_g, query_result_file)
                        #except:
                        #    print "    ****  copy to local failed, retry!"
                        #    beeline.bln_e_output(strcmd_g, query_result_file)
                        break
                    except:
                        # delete the folder if summarization failed.
                        print "    ******  failed with time cost = %s upto #retrials=%s" % (str(time.time()-tic), str(count_retrial))
                        hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                        count_retrial += 1
            else:
                print " file exists."

    else:
        # ############################### #
        # The SUMMARY HOUR hive procedure #
        # ############################### #
        print "    ****  summary hour tour:"
        # check if the summary has been performed on this particular hour (last hour)
        print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
        if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp), '000000_0.deflate')):
            print " file not exits,",
            f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_hour.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
            f.close()
            strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp)
            query_result_file = os.path.join(config.mrqos_query_result,'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))

            print " BLN for hourly summary: day = %s, hour = %s. " %(datestamp, hourstamp)
            count_retrial = 0
            while count_retrial < region_summary_retrial_max:
                tic = time.time()
                try:
                    beeline.bln_e(strcmd_s)
                    print "    ******  success with time cost = %s." % str(time.time()-tic)
                    #try:
                    #    beeline.bln_e_output(strcmd_g, query_result_file)
                    #except:
                    #    print "    ****  copy to local failed, retry!"
                    #    beeline.bln_e_output(strcmd_g, query_result_file)
                    break
                except:
                    # delete the folder if summarization failed.
                    print "    ******  failed with time cost = %s upto # retrials=%s" % (str(time.time()-tic), str(count_retrial))
                    hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp), r=True)
                    count_retrial += 1
        else:
            print " file exists."


        # ############################ #
        # The CASE VIEW hive procedure #
        # ############################ #
        print "    ****  case view tour:"
        # check if the summary has been performed on this particular hour (last hour)
        print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
        if hdfsutil.test_file(os.path.join(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
            print " file not exits,",
            f = open(os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
            f.close()
            strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
            query_result_file = os.path.join(config.mrqos_query_result,'case_view_hour.%s.%s.csv' % (datestamp, hourstamp))
            print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
            count_retrial = 0
            while count_retrial < region_summary_retrial_max:
                try:
                    tic = time.time()
                    beeline.bln_e(strcmd_s)
                    print "    ******  success with time cost = %s." % str(time.time()-tic)
                    # repair don't care about moving the result to SQLite DB
                    #try:
                    #    beeline.bln_e_output(strcmd_g, query_result_file)
                    #except:
                    #    print "    ****  copy to local failed, retry!"
                    #    beeline.bln_e_output(strcmd_g, query_result_file)
                    break
                except:
                    # delete the folder if summarization failed.
                    print "    ******  failed with time cost = %s upto #retrials=%s" % (str(time.time()-tic), str(count_retrial))
                    hdfsutil.rm(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), r=True)
                    count_retrial += 1

        else:
            print " file exists."


        # ############################## #
        # The REGION VIEW hive procedure #
        # ############################## #
        print "    ****  region view tour:"
        # check if the summary has been performed on this particular hour (last hour)
        print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
        if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
            print " file not exits,",
            f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
            f.close()
            strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
            query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
            print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
            count_retrial = 0
            while count_retrial < region_summary_retrial_max:
                try:
                    tic = time.time()
                    beeline.bln_e(strcmd_s)
                    print "    ******  success with time cost = %s." % str(time.time()-tic)
                    # repair don't care about moving the result to SQLite DB
                    #try:
                    #    beeline.bln_e_output(strcmd_g, query_result_file)
                    #except:
                    #    print "    ****  copy to local failed, retry!"
                    #    beeline.bln_e_output(strcmd_g, query_result_file)
                    break
                except:
                    # delete the folder if summarization failed.
                    print "    ******  failed with time cost = %s upto #retrials=%s" % (str(time.time()-tic), str(count_retrial))
                    hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                    count_retrial += 1
        else:
            print " file exists."
Code example #19
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts) + " = " + time.strftime(
        'GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts))
    print "###################"
    ts_last_hour = ts - 3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    #hour_list = [str("%02d" % x) for x in range(24)]
    region_summary_retrial_max = 10

    # ############################### #
    # The SUMMARY HOUR hive procedure #
    # ############################### #
    print "    ****  summary hour tour:"
    # check if the summary has been performed on this particular hour (last hour)
    print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(
            os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp),
                         '000000_0.deflate')):
        print " file not exits,",
        f = open(
            os.path.join(config.mrqos_hive_query,
                         'mrqos_region_summarize_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp,
                             datestamp, hourstamp)
        f.close()
        strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (
            datestamp, hourstamp)
        query_result_file = os.path.join(
            config.mrqos_query_result,
            'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))

        print " BLN for hourly summary: day = %s, hour = %s. " % (datestamp,
                                                                  hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            tic = time.time()
            try:
                beeline.bln_e(strcmd_s)
                print "    ******  success with time cost = %s." % str(
                    time.time() - tic)
                #try:
                #    beeline.bln_e_output(strcmd_g, query_result_file)
                #except:
                #    print "    ****  copy to local failed, retry!"
                #    beeline.bln_e_output(strcmd_g, query_result_file)
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ******  failed with time cost = %s upto # retrials=%s" % (
                    str(time.time() - tic), str(count_retrial))
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp),
                            r=True)
                count_retrial += 1
    else:
        print " file exists."

    # ############################ #
    # The CASE VIEW hive procedure #
    # ############################ #
    print "    ****  case view tour:"
    # check if the summary has been performed on this particular hour (last hour)
    print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(
            os.path.join(
                config.hdfs_qos_case_view_hour % (datestamp, hourstamp),
                '000000_0.deflate')):
        print " file not exits,",
        f = open(
            os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'),
            'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (
            datestamp, hourstamp)
        query_result_file = os.path.join(
            config.mrqos_query_result,
            'case_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary for day = %s, hour = %s." % (datestamp,
                                                                    hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                tic = time.time()
                beeline.bln_e(strcmd_s)
                print "    ******  success with time cost = %s." % str(
                    time.time() - tic)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except sp.CalledProcessError as e:
                    print "    ****  copy to local failed, retry!"
                    print e.message
                    beeline.bln_e_output(strcmd_g, query_result_file)
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ******  failed with time cost = %s upto #retrials=%s" % (
                    str(time.time() - tic), str(count_retrial))
                print e.message
                hdfsutil.rm(config.hdfs_qos_case_view_hour %
                            (datestamp, hourstamp),
                            r=True)
                count_retrial += 1

    else:
        print " file exists."

    # ############################## #
    # The REGION VIEW hive procedure #
    # ############################## #
    print "    ****  region view tour:"
    # check if the summary has been performed on this particular hour (last hour)
    print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(
            os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp),
                         '000000_0.deflate')):
        print " file not exits,",
        f = open(
            os.path.join(config.mrqos_hive_query,
                         'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp,
                             datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (
            datestamp, hourstamp)
        query_result_file = os.path.join(
            config.mrqos_query_result,
            'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary for day = %s, hour = %s." % (datestamp,
                                                                    hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                tic = time.time()
                beeline.bln_e(strcmd_s)
                print "    ******  success with time cost = %s." % str(
                    time.time() - tic)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except sp.CalledProcessError as e:
                    print "    ****  copy to local failed, retry!"
                    print e.message
                    beeline.bln_e_output(strcmd_g, query_result_file)
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ******  failed with time cost = %s upto #retrials=%s" % (
                    str(time.time() - tic), str(count_retrial))
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour %
                            (datestamp, hourstamp),
                            r=True)
                count_retrial += 1

    else:
        print " file exists."
コード例 #20
0
ファイル: query_jobs.py プロジェクト: YuTengChang/akam_mrqos
def main():
    """  this function will do the query on 5 different measurement and upload
    the data to hdfs accordingly, this also join tables at single time point """

    # different queries (various types)
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']

    sql = """sql2 -q map.mapnoccthree.query.akadns.net --csv "`cat """
    post = """`" | tail -n+3 | awk -F"," 'BEGIN{OFS=","}{$1=""; print $0}' | sed 's/^,//g' > """

    # current time
    timenow = int(time.time())

    print "###################"
    print "Start processing the data back in for 10 minute joins"
    print "starting processing time is " + str(timenow)
    print "###################"

    # fetch the data through query with retrials
    print "    ****  querying mrqos data."
    for item in mtype:
        flag = 0
        count = 0
        dest = os.path.join(config.mrqos_data, item + '.tmp')
        aggs = os.path.join(config.mrqos_query, item + '.qr')

        cmd = sql + aggs + post + dest
        n_retrial = config.query_retrial
        t_timeout = config.query_timeout
        # retry multiple times, each attempt bounded by a timeout
        while (flag == 0) and (count < n_retrial):
            try:
                with ytt.Timeout(t_timeout):
                    sp.call(cmd, shell=True)
                    flag = 1
            except:  # timed out or errored; count the attempt and retry
                count += 1
        # if any of the queries could not be fetched, stop the whole run
        if count >= n_retrial:
            print ">> data fetch failed in querying table %s" % item
            return

    # provide SCORE table with peak/off-peak attribute
    print "    ****  provide PEAK in score."
    sp.call([config.provide_peak], shell=True)

    # backup the individual query file by copying to backup folder
    print "    ****  backing up queried results."
    if not os.path.exists(
            '/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow)):
        os.makedirs('/home/testgrp/MRQOS/mrqos_data/backup/%s' % str(timenow))
        for item in mtype:
            filesrc = os.path.join(config.mrqos_data, item + '.tmp')
            filedst = '/home/testgrp/MRQOS/mrqos_data/backup/%s/' % str(
                timenow)
            shutil.copy(filesrc, filedst)

    # upload to hdfs and link to hive tables
    print "    ****  uploading to hdfs and hive."
    try:
        # adding the individual query result to hdfs and add hive partitions
        for item in mtype:
            listname = os.path.join(config.mrqos_data, item + '.tmp')
            hdfs_d = os.path.join(config.hdfs_table, item,
                                  'ts=%s' % str(timenow))
            upload_to_hive(listname, hdfs_d, str(timenow), item)
        shutil.rmtree('/home/testgrp/MRQOS/mrqos_data/backup/%s' %
                      str(timenow))

        # new version of the join tables in hive: direct insert #
        # specify the new joined file in hdfs
        hdfs_file = os.path.join(config.hdfs_table, 'mrqos_join',
                                 'ts=%s' % str(timenow), '000000_0.deflate')
        # specify the local copy of the joined file
        local_file = os.path.join(config.mrqos_data_backup, '000000_0.deflate')
        try:
            print "    ****  direct join and insert into mrqos_join."
            # direct join and insert in hive
            f = open('/home/testgrp/MRQOS/MRQOS_table_join2.hive', 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (str(timenow), str(timenow), str(timenow),
                                 str(timenow), str(timenow), str(timenow))
            f.close()
            print "    ****  perform beeline for join."
            beeline.bln_e(strcmd_s)
            # have the local copy of the joined file
            print "    ****  copy the joined file for backup."
            hdfsutil.get(hdfs_file, local_file)
        except sp.CalledProcessError as e:
            print ">> direct join and insert failed, trying to copy the last succeeded one"
            print e.message
            try:
                # upload the last succeeded one from local
                print "    ****  copying backups from local to hdfs"
                hdfsutil.put(local_file, hdfs_file)
                try:
                    # using hive to add partitions to joined query results
                    print "    ****  adding hive partitions"
                    hiveql_str = 'use mrqos; alter table mrqos_join add partition(ts=%s);' % str(
                        timenow)
                    beeline.bln_e(hiveql_str)
                except sp.CalledProcessError as e:
                    print ">> copying from duplicated file for mrqos_join failed in adding partitions"
                    print e.message
                    #raise HiveCreatePartitionError
            except:
                print "copying from duplicated file for mrqos_join failed in uploading to hdfs"
    except:
        print "HDFS upload failed, backup file retains"

    # clear the expired data in mrqos_table
    print "    ****  clean up mrqos individual table."
    mrqos_table_cleanup()
    # clear the expired data in mrqos_join
    print "    ****  clean up mrqos joined table."
    mrqos_join_cleanup()
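
The upload step in this example first tries a direct Hive join-and-insert into mrqos_join and keeps a local copy of the joined file; only when that fails does it push the last successful local copy back to HDFS and register the partition. A compact sketch of that fallback, under the same assumptions about the beeline/hdfsutil/config helpers (the hive file path is the one hard-coded in the snippet):

import os
import subprocess as sp

import beeline   # project helper (assumed)
import config    # project configuration module (assumed)
import hdfsutil  # project HDFS helper (assumed)


def join_or_restore(timenow,
                    hive_join_file='/home/testgrp/MRQOS/MRQOS_table_join2.hive'):
    """ illustrative sketch: direct Hive join for partition ts=timenow, with a
        fallback to the last successfully joined file kept as a local backup """
    hdfs_file = os.path.join(config.hdfs_table, 'mrqos_join',
                             'ts=%s' % timenow, '000000_0.deflate')
    local_file = os.path.join(config.mrqos_data_backup, '000000_0.deflate')
    try:
        with open(hive_join_file, 'r') as f:
            strcmd_s = f.read() % ((str(timenow),) * 6)
        beeline.bln_e(strcmd_s)
        # keep a local copy of the newly joined file for future fallbacks
        hdfsutil.get(hdfs_file, local_file)
    except sp.CalledProcessError:
        # restore the last successful join and register the partition in Hive
        hdfsutil.put(local_file, hdfs_file)
        beeline.bln_e('use mrqos; alter table mrqos_join add partition(ts=%s);'
                      % timenow)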
コード例 #21
0
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'cron_region_summary_hour.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # start the logging
    logger.info("###################")
    logger.info("# Performing the hourly mrqos_region summary")
    logger.info("# starting time: " + str(ts) + " = " + time.strftime('GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts)))
    logger.info("###################")

    # parameter: back-filling window length (in hours)
    bf_length = config.region_summary_back_filling
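    # timestamps for the previous bf_length full hours, most recent first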
    ts_last_couple_hour_list = [ts-(1+x)*3600 for x in range(bf_length)]

    for ts_last_hour in ts_last_couple_hour_list:
        datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
        hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
        region_summary_retrial_max = 10

        # ############################### #
        # The SUMMARY HOUR hive procedure #
        # ############################### #
        #logger.info("    ****  summary hour tour: checking day = %s, hour = %s." % (datestamp, hourstamp))
        # check if the summary has been performed on this particular hour (last hour)
        if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp), '000000_0.deflate')):
            logger.info("** region summary hour: checking day = %s, hour = %s, and file does not exist." % (datestamp,
                                                                                                            hourstamp))
            f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_hour.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
            f.close()

            count_retrial = 0
            while count_retrial < region_summary_retrial_max:
                tic = time.time()
                try:
                    beeline.bln_e(strcmd_s)
                    logger.info("BLN region summary hour success @ cost = %s sec." % str(time.time()-tic))
                    break
                except sp.CalledProcessError as e:
                    # delete the folder if summarization failed.
                    logger.info("BLN region summary hour failed @ cost = %s sec in retrial #%s" % (str(time.time()-tic),
                                                                                                   str(count_retrial)))
                    logger.exception("message")
                    hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp), r=True)
                    count_retrial += 1
        else:
            logger.info("** region summary hour: checking day = %s, hour = %s, and file exists." % (datestamp,
                                                                                                    hourstamp))


        # ############################ #
        # The CASE VIEW hive procedure #
        # ############################ #
        #print "    ****  case view tour:"
        # check if the summary has been performed on this particular hour (last hour)
        if hdfsutil.test_file(os.path.join(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
            logger.info("** case view hour: checking day = %s, hour = %s, and file does not exist." % (datestamp,
                                                                                                       hourstamp))
            f = open(os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
            f.close()
            strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
            query_result_file = os.path.join(config.mrqos_query_result,'case_view_hour.%s.%s.csv' % (datestamp, hourstamp))

            count_retrial = 0
            while count_retrial < region_summary_retrial_max:
                try:
                    tic = time.time()
                    beeline.bln_e(strcmd_s)
                    logger.info("BLN case view hour success @ cost = %s sec." % str(time.time()-tic))
                    try:
                        beeline.bln_e_output(strcmd_g, query_result_file)
                    except sp.CalledProcessError as e:
                        logger.warning("copy to local failed, retrying...")
                        logger.warning(e.message)
                        try:
                            beeline.bln_e_output(strcmd_g, query_result_file)
                        except sp.CalledProcessError as e:
                            logger.error("copy to local failed again, abort.")
                            logger.exception("message")
                    break
                except sp.CalledProcessError as e:
                    # delete the folder if summarization failed.
                    logger.info("BLN case view hour failed @ cost = %s sec in retrial #%s" % (str(time.time()-tic),
                                                                                              str(count_retrial)))
                    logger.exception("message")
                    hdfsutil.rm(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), r=True)
                    count_retrial += 1

        else:
            logger.info("** case view hour: checking day = %s, hour = %s, and file exists." % (datestamp,
                                                                                               hourstamp))



        # ############################## #
        # The REGION VIEW hive procedure #
        # ############################## #
        # check if the summary has been performed on this particular hour (last hour)
        if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
            logger.info("** region view hour: checking day = %s, hour = %s, and file does not exist." % (datestamp,
                                                                                                         hourstamp))
            f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
            f.close()
            strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
            query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))

            count_retrial = 0
            while count_retrial < region_summary_retrial_max:
                try:
                    tic = time.time()
                    beeline.bln_e(strcmd_s)
                    logger.info("BLN region view hour success @ cost = %s sec." % str(time.time()-tic))
                    try:
                        beeline.bln_e_output(strcmd_g, query_result_file)
                    except sp.CalledProcessError as e:
                        logger.warning("copy to local failed, retrying...")
                        logger.warning(e.message)
                        try:
                            beeline.bln_e_output(strcmd_g, query_result_file)
                        except sp.CalledProcessError as e:
                            logger.error("copy to local failed again, abort.")
                            logger.exception("message")
                    break
                except sp.CalledProcessError as e:
                    # delete the folder if summarization failed.
                    logger.info("BLN region view hour failed @ cost = %s sec in retrial #%s" % (str(time.time()-tic),
                                                                                                str(count_retrial)))
                    logger.exception("message")
                    hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                    count_retrial += 1

        else:
            logger.info("** region view hour: checking day = %s, hour = %s, and file exists." % (datestamp,
                                                                                                 hourstamp))
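
The back-filling loop in this example is driven entirely by the list of (datestamp, hour) pairs for the previous config.region_summary_back_filling hours. A self-contained sketch of that derivation (standard library only, no project helpers needed):

import calendar
import time


def backfill_hours(bf_length, now=None):
    """ return (datestamp, hourstamp) pairs for the previous bf_length full
        hours in GMT, most recent first (illustrative helper) """
    ts = calendar.timegm(time.gmtime()) if now is None else now
    pairs = []
    for x in range(bf_length):
        t = time.gmtime(ts - (1 + x) * 3600)
        pairs.append((time.strftime('%Y%m%d', t), time.strftime('%H', t)))
    return pairs

For example, backfill_hours(3) returns the (YYYYMMDD, HH) pairs for one, two and three hours ago, most recent first.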
コード例 #22
0
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts - 3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
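    # hours of the current GMT day up to and including the previous full hour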
    hour_list = [str("%02d" % x) for x in range(24)]
    hour_list = [x for x in hour_list if x <= hourstamp]
    region_summary_retrial_max = 10

    # derive the day-level HDFS folder by stripping the hour component from the hourly path
    folders_day = '/'.join(
        str(config.hdfs_qos_rg_view_hour % (datestamp, '00')).split('/')[0:-1])

    # check if the summary folder for "this day" (datestamp) has been created or not, if not, create one
    if hdfsutil.test_dic(folders_day):
        hdfsutil.mkdir(folders_day)

    folders_in = [folders_day + '/hour=%s' % x for x in hour_list]
    folders_out = hdfsutil.ls(folders_day)

    folders_missing = [x for x in folders_in if x not in folders_out]
    folders_missing.sort(reverse=True)

    for item in folders_missing:
        hourstamp = item[-2:]
        print "    ****  missing data for day = %s, hour = %s." % (datestamp,
                                                                   hourstamp),
        f = open(
            os.path.join(config.mrqos_hive_query,
                         'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp,
                             datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (
            datestamp, hourstamp)
        query_result_file = os.path.join(
            config.mrqos_query_result,
            'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print "    ****  perform beeline for hourly summary for day = %s, hour = %s." % (
            datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print "    ****  copy to local failed!"
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ****  summarization failed upto #retrials=" + str(
                    count_retrial)
                print "    ****  ",
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour %
                            (datestamp, hourstamp),
                            r=True)
                count_retrial += 1
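
The gap detection in this last example boils down to comparing the expected hour=HH subfolders of the day directory against what hdfsutil.ls actually reports. A short sketch of that step, assuming hdfsutil.ls returns full child paths as in the snippet:

import hdfsutil  # project HDFS helper (assumed to list full child paths)


def missing_hours(folders_day, hour_list):
    """ illustrative sketch: return the hour=HH subfolders of folders_day that
        should exist for the given hours but are not present on HDFS """
    expected = [folders_day + '/hour=%s' % h for h in hour_list]
    present = hdfsutil.ls(folders_day)
    missing = [p for p in expected if p not in present]
    missing.sort(reverse=True)  # handle the most recent missing hour first
    return missing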