def mrqos_join_cleanupv2(logger):
    """Drop expired partitions of ``mrqos.mrqos_join2`` and delete their HDFS folders.

    A partition is expired when its ``ts`` value is smaller than
    ``time.time() - config.mrqos_join_delete``.  The expired partition names
    are collected from Hive first, then the partitions are dropped in Hive and
    finally each matching folder below ``<config.hdfs_table>/mrqos_join2`` is
    removed.

    :param logger: logger used to report progress and failures.
    :return: None.  ``subprocess.CalledProcessError`` raised by the beeline /
        hdfs helpers is logged and not propagated.
    """
    cutoff = int(time.time()) - config.mrqos_join_delete

    # get the list of retired partitions from hive; list the same table that
    # is dropped and cleaned below
    try:
        partition_lines = beeline.show_partitions('mrqos.mrqos_join2').split('\n')
    except sp.CalledProcessError as e:
        logger.error('failed to obtain retire partition list (HIVE)')
        # CalledProcessError carries no ``message``; str(e) has cmd + exit status
        logger.error('error message: %s' % str(e))
        return

    hdfs_remove_list = []
    for line in partition_lines:
        if '=' not in line:
            continue
        try:
            # compare numerically: a string comparison mis-orders timestamps
            # that do not have the same number of digits
            if int(line.split('=')[1]) < cutoff:
                hdfs_remove_list.append(line)
        except ValueError:
            # not a numeric ts partition, leave it alone
            continue

    # drop the partitions in hive
    try:
        beeline.drop_partitions('mrqos.mrqos_join2', 'ts<%s' % str(cutoff))
    except sp.CalledProcessError as e:
        logger.error('failed to drop partitions. ')
        logger.error('error message: %s' % str(e))
        return
    logger.info("drop hive partitions successful. ")

    # remove the hdfs folders; one failing folder does not stop the others
    failed = 0
    for partition_id in hdfs_remove_list:
        hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join2', partition_id)
        try:
            hdfsutil.rm(hdfs_d, r=True)
        except sp.CalledProcessError:
            failed += 1
            logger.error('failed to remove HDFS folder for mrqos_join2 at partition folder %s' % partition_id)
    if not failed:
        # only claim success when every folder was actually removed
        logger.info('remove HDFS successful. ')
Beispiel #2
0
def mrqos_table_cleanup():
    """ when called, this function will delete all partitions
        the clnspp table as long as it is older than the threshold """

    # get the lowest partition by checking the HDFS folders
    score_partitions = hdfsutil.ls(config.hdfs_table_score)
    str_parts_list = [i.split('=', 1)[1] for i in score_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold, if not, drop in hive table and remove from hdfs
    timenow = int(time.time())
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']
    for item in mtype:
        exec('this_partitions = hdfsutil.ls(config.hdfs_table_%s)' % item)
        str_parts_list = [i.split('=', 1)[1] for i in this_partitions]
        str_parts_list_int = map(int, str_parts_list)
        print "      ##  for table: %s" % item
        print "      ##  ",
        print str_parts_list_int

        for partition in str_parts_list_int:
            if partition < timenow - config.mrqos_table_delete:
                try:
                    print "      ##  handling table: %s with ts=%s" % (item, str(partition))
                    # drop partitions (ok even if partition does not exist)
                    hiveql_str = 'use mrqos; alter table ' + item + ' drop if exists partition(ts=%s)' % str(partition)
                    beeline.bln_e(hiveql_str)
                    # remove data from HDFS (ok even if folder in hdfs does not exist)
                    hdfs_d = os.path.join(config.hdfs_table, item, 'ts=%s' % partition)
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed in hive table clean up in table: %s." % item
                    print e.message
def main():
    """ get the date for the past day (yesterday). """
    timenow = int(time.time())
    datenow = str(datetime.date.today()-datetime.timedelta(1))
    datenow = datenow[0:4]+datenow[5:7]+datenow[8:10]

    print "###################"
    print "# Start processing the data back in " + datenow + " (yesterday)"
    print "# starting processing time is " + str(timenow)
    print "###################"

    ts = calendar.timegm(time.gmtime())
    ts_last_hour = ts-3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))

    # check if the summary has been performed on this particular hour (last hour)
    print "    ****  checking day = %s." % (datestamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_day % (datestamp), '000000_0.deflate')):
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_day.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, datestamp, datestamp)
        f.close()
        print "    ****  perform beeline for hourly summary for day = %s, hour = %s." %(datestamp, hourstamp)
        try:
            beeline.bln_e(strcmd_s)
        except:
            # delete the folder if summarization failed.
            print "    ****  summarization failed, removed hdfs folder."
            hdfsutil.rm(config.hdfs_qos_rg_day % (datestamp), r=True)
    else:
        print " file exists."
Beispiel #4
0
def mrqos_join_cleanup():
    """ when called, this function will delete all partitions
        the clnspp table as long as it is older than the threshold """

    # get the lowest partition by checking the HDFS folders
    joined_partitions = hdfsutil.ls(config.hdfs_table_join)
    str_parts_list = [i.split('=', 1)[1] for i in joined_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold
    timenow = int(time.time())

    # get the list of retired data in HDFS using hive partitions
    try:
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join').split('\n')\
                            if '=' in x and x.split('=')[1] < str(timenow-config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join', 'ts<%s' % str(timenow-config.mrqos_join_delete))
            print " drop partitions successful. "
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed to remove HDFS folder for mrqos_join at partition folder %s" % str(partition_id)
            print " remove HDFS successful. "
        except sp.CalledProcessError as e:
            print ">> failed to drop partitions"
    except sp.CalledProcessError as e:
        print ">> failed to obtain retire partition list (HIVE)"
        print e.message
Beispiel #5
0
def upload_to_hive(listname, hdfs_d, partition, tablename, logger):
    """Upload a local file into a new HDFS folder and add it as a Hive partition.

    The steps are: create ``hdfs_d``, put ``listname`` into it, then run
    ``alter table <tablename> add partition(<partition>)`` in database
    ``mrqos``.  If the upload or the alter-table step fails, ``hdfs_d`` is
    removed again.  ``subprocess.CalledProcessError`` is logged, never raised.

    :param listname: local file handed to ``hdfs.put``.
    :param hdfs_d: HDFS directory that is created for the partition.
    :param partition: partition spec inserted verbatim into ``add partition(...)``.
    :param tablename: table name, used inside ``use mrqos``.
    :param logger: logger used to report each step.
    :return: None.
    """

    def remove_partition_folder():
        # roll back the HDFS folder; a failing rollback is reported as such
        # instead of being mistaken for an upload / mkdir failure
        try:
            hdfs.rm(hdfs_d, r=True)
        except sp.CalledProcessError as rm_err:
            logger.error('HDFS cleanup failed: %s' % hdfs_d)
            logger.error('error: %s' % str(rm_err))

    # create the partition folder
    try:
        hdfs.mkdir(hdfs_d)
    except sp.CalledProcessError as e:
        logger.error('HDFS directory creation failed.')
        # CalledProcessError carries no ``message``; str(e) has cmd + exit status
        logger.error('error: %s' % str(e))
        return
    logger.info('HDFS directory creation succeeded: %s' % hdfs_d)

    # upload the local file
    try:
        hdfs.put(listname, hdfs_d)
    except sp.CalledProcessError as e:
        logger.error('HDFS upload failed.')
        logger.error('error: %s' % str(e))
        remove_partition_folder()
        return
    logger.info('HDFS upload succeeded: %s' % listname)

    # register the partition in hive
    try:
        hiveql_str = 'use mrqos; alter table ' + tablename + ' add partition(%s);' % (partition)
        bln_e(hiveql_str)
    except sp.CalledProcessError as e:
        logger.error('add partition (alter table) failed.')
        logger.error('error: %s' % str(e))
        remove_partition_folder()
        return
    logger.info('add partition (alter table) succeeded %s' % tablename)
Beispiel #6
0
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts-3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    hour_list = [str("%02d" % x) for x in range(24)]
    region_summary_retrial_max = 10

    # check if the summary has been performed on this particular hour (last hour)
    print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print "    ****  perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print "    ****  copy to local failed!"
                break
            except:
                # delete the folder if summarization failed.
                print "    ****  summarization failed upto #retrials="+str(count_retrial)
                hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1

    else:
        print " file exists."

    # check if the summary has been performed since the beginning of the day, last check on day X is X+1/0:30:00
    for hour in hour_list:
        if hour < hourstamp:
            print "    ****  checking day = %s, hour = %s." % (datestamp, hour),
            if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hour), '000000_0.deflate')):
                f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
                strcmd = f.read()
                strcmd_s = strcmd % (datestamp, hour, datestamp, hour, datestamp, hour)
                f.close()
                print "    ****  perform beeline for hourly summary for day = %s, hour = %s." %(datestamp, hour)
                try:
                    beeline.bln_e(strcmd_s)
                except:
                    # delete the folder if summarization failed.
                    print "    ****  summarization failed, removed hdfs folder."
                    hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hour), r=True)
            else:
                print " file exists."
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts-3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    hour_list = [str("%02d" % x) for x in range(24)]
    hour_list = [x for x in hour_list if x <= hourstamp]
    region_summary_retrial_max = 10

    # check if the summary has been performed on this particular hour (last hour)
    folders_day = '/'.join(str(config.hdfs_qos_rg_view_hour % (datestamp, '00')).split('/')[0:-1])

    # check if the summary folder for "this day" (datestamp) has been created or not, if not, create one
    if hdfsutil.test_dic(folders_day):
        hdfsutil.mkdir(folders_day)

    folders_in = [folders_day+'/hour=%s' % x for x in hour_list]
    folders_out = hdfsutil.ls(folders_day)

    folders_missing = [x for x in folders_in if x not in folders_out]
    folders_missing.sort(reverse=True)

    for item in folders_missing:
        hourstamp = item[-2:]
        print "    ****  missing data for day = %s, hour = %s." % (datestamp, hourstamp),
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print "    ****  perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print "    ****  copy to local failed!"
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ****  summarization failed upto #retrials="+str(count_retrial)
                print "    ****  ",
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
def main():
    """Summarize yesterday's mapmon data for every mpd uuid found in HDFS.

    The uuids are taken from the folder names below
    ``<parent of config.hdfs_table>/mapper/mapmon/day=<yesterday>``.  For each
    uuid the output folder ``mapmon_sum/day=.../mpd_uuid=...`` is created when
    needed and, when the check on its ``000000_0.deflate`` file asks for it,
    the query template ``mapmon_summarize.hive`` is executed through beeline.
    If the query fails, the output folder is removed.
    """
    # initialize the logger
    logging.basicConfig(
        filename=os.path.join('/home/testgrp/logs/', 'mapmon_summarize.log'),
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # yesterday (local date) as YYYYMMDD
    yesterday = datetime.date.today() - datetime.timedelta(1)
    date_idx = yesterday.strftime('%Y%m%d')

    # get the latest barebone day_idx
    bb_day_idx = beeline.get_last_partitions('mapper.barebones').split('=')[1]
    logger.info("barebone index: day={}".format(bb_day_idx))

    # get the latest mpd yesterday
    mapmon_day = os.path.join(os.path.dirname(config.hdfs_table), 'mapper',
                              'mapmon', 'day={}'.format(date_idx))
    uuid_list = [x.split('=')[-1] for x in hdfsutil.ls(mapmon_day)]
    query_path = os.path.join(config.mrqos_hive_query, 'mapmon_summarize.hive')

    for uuid_idx in uuid_list:
        logger.info("dealing with day={}, uuid={}".format(date_idx, uuid_idx))
        file_location = os.path.join(config.hdfs_table, 'mapmon_sum',
                                     'day={}'.format(date_idx),
                                     'mpd_uuid={}'.format(uuid_idx))
        # NOTE(review): test_dic() / test_file() appear to return a truthy
        # value when the target is missing - confirm in hdfsutil
        if hdfsutil.test_dic(file_location):
            logger.info('creating folder: {}'.format(file_location))
            hdfsutil.mkdir(file_location)

        if not hdfsutil.test_file(os.path.join(file_location, '000000_0.deflate')):
            logger.info(" file exists.")
            continue

        # 'with' closes the template even when reading it fails
        with open(query_path, 'r') as f:
            strcmd = f.read()
        strcmd_s = strcmd % (date_idx, uuid_idx, bb_day_idx, date_idx,
                             uuid_idx, date_idx, uuid_idx)
        try:
            beeline.bln_e(strcmd_s)
        except Exception as e:
            # delete the folder if summarization failed; KeyboardInterrupt and
            # SystemExit are deliberately not swallowed here
            logger.warning("summarization failed, removing hdfs folder.")
            logger.warning("error: %s" % str(e))
            hdfsutil.rm(file_location, r=True)
def cleanup_mrqos_region_related_tables(datestamp, hour):
    tables = ['mrqos_region_hour', 'case_view_hour', 'region_view_hour']
    for table_item in tables:
        try:
            # drop partitions (ok even if partition does not exist)
            hiveql_str = 'use mrqos; alter table %s drop if exists partition(datestamp=%s, hour=%s)' % (table_item,
                                                                                                        str(datestamp),
                                                                                                        str(hour))
            beeline.bln_e(hiveql_str)
            # remove data from HDFS (ok even if folder in hdfs does not exist)
            hdfs_d = os.path.join(config.hdfs_table, table_item, 'datestamp=%s' % str(datestamp), 'hour=%s' % str(hour))
            hdfsutil.rm(hdfs_d, r=True)
        except sp.CalledProcessError:
            print ">> failed in hive table clean up in table: %s for partition datestamp=%s, hour=%s." % (table_item,
                                                                                                          str(datestamp),
                                                                                                          str(hour))
            pass
def main():
    """Summarize yesterday's mapmon data for every mpd uuid found in HDFS.

    The uuids are taken from the folder names below
    ``<parent of config.hdfs_table>/mapper/mapmon/day=<yesterday>``.  For each
    uuid the output folder ``mapmon_sum/day=.../mpd_uuid=...`` is created when
    needed and, when the check on its ``000000_0.deflate`` file asks for it,
    the query template ``mapmon_summarize.hive`` is executed through beeline.
    If the query fails, the output folder is removed.
    """
    # initialize the logger
    logging.basicConfig(filename=os.path.join('/home/testgrp/logs/', 'mapmon_summarize.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # yesterday (local date) as YYYYMMDD
    date_idx = (datetime.date.today()-datetime.timedelta(1)).strftime('%Y%m%d')

    # get the latest barebone day_idx
    bb_day_idx = beeline.get_last_partitions('mapper.barebones').split('=')[1]
    logger.info("barebone index: day={}".format(bb_day_idx))

    # get the latest mpd yesterday
    mapmon_day = os.path.join(os.path.dirname(config.hdfs_table),
                              'mapper', 'mapmon', 'day={}'.format(date_idx))
    uuid_list = [x.split('=')[-1] for x in hdfsutil.ls(mapmon_day)]

    for uuid_idx in uuid_list:
        logger.info("dealing with day={}, uuid={}".format(date_idx, uuid_idx))
        file_location = os.path.join(config.hdfs_table,
                                     'mapmon_sum',
                                     'day={}'.format(date_idx),
                                     'mpd_uuid={}'.format(uuid_idx))
        # NOTE(review): test_dic() / test_file() appear to return a truthy
        # value when the target is missing - confirm in hdfsutil
        if hdfsutil.test_dic(file_location):
            logger.info('creating folder: {}'.format(file_location))
            hdfsutil.mkdir(file_location)

        if hdfsutil.test_file(os.path.join(file_location, '000000_0.deflate')):
            # 'with' closes the template even when reading it fails
            with open(os.path.join(config.mrqos_hive_query, 'mapmon_summarize.hive'), 'r') as f:
                strcmd = f.read()
            strcmd_s = strcmd % (date_idx, uuid_idx, bb_day_idx,
                                 date_idx, uuid_idx,
                                 date_idx, uuid_idx)
            try:
                beeline.bln_e(strcmd_s)
            except Exception as e:
                # delete the folder if summarization failed; KeyboardInterrupt
                # and SystemExit are deliberately not swallowed here
                logger.warning("summarization failed, removing hdfs folder.")
                logger.warning("error: %s" % str(e))
                hdfsutil.rm(file_location, r=True)
        else:
            logger.info(" file exists.")
Beispiel #11
0
def upload_to_hive(listname, hdfs_d, partition, tablename, logger):
    """Upload a local file into a new HDFS folder and add it as a Hive partition.

    The steps are: create ``hdfs_d``, put ``listname`` into it, then run
    ``alter table <tablename> add partition(<partition>)`` in database
    ``mrqos``.  If the upload or the alter-table step fails, ``hdfs_d`` is
    removed again.  ``subprocess.CalledProcessError`` is logged, never raised.

    :param listname: local file handed to ``hdfs.put``.
    :param hdfs_d: HDFS directory that is created for the partition.
    :param partition: partition spec inserted verbatim into ``add partition(...)``.
    :param tablename: table name, used inside ``use mrqos``.
    :param logger: logger used to report each step.
    :return: None.
    """

    def log_failure(summary, err):
        # CalledProcessError carries no ``message``; str(err) has cmd + exit status
        logger.error(summary)
        logger.error('error: %s' % str(err))

    def rollback():
        # remove the partition folder again; a failing rollback is reported as
        # such instead of being mistaken for an upload / mkdir failure
        try:
            hdfs.rm(hdfs_d, r=True)
        except sp.CalledProcessError as rm_err:
            log_failure('HDFS cleanup failed: %s' % hdfs_d, rm_err)

    # step 1: create the partition folder
    try:
        hdfs.mkdir(hdfs_d)
    except sp.CalledProcessError as e:
        log_failure('HDFS directory creation failed.', e)
        return
    logger.info('HDFS directory creation succeeded: %s' % hdfs_d)

    # step 2: upload the local file
    try:
        hdfs.put(listname, hdfs_d)
    except sp.CalledProcessError as e:
        log_failure('HDFS upload failed.', e)
        rollback()
        return
    logger.info('HDFS upload succeeded: %s' % listname)

    # step 3: register the partition in hive
    hiveql_str = 'use mrqos; alter table ' + tablename + ' add partition(%s);' % (
        partition)
    try:
        bln_e(hiveql_str)
    except sp.CalledProcessError as e:
        log_failure('add partition (alter table) failed.', e)
        rollback()
        return
    logger.info('add partition (alter table) succeeded %s' % tablename)
Beispiel #12
0
def mrqos_table_cleanup():
    """ when called, this function will delete all partitions
        the clnspp table as long as it is older than the threshold """

    # get the lowest partition by checking the HDFS folders
    score_partitions = hdfsutil.ls(config.hdfs_table_score)
    str_parts_list = [i.split('=', 1)[1] for i in score_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold, if not, drop in hive table and remove from hdfs
    timenow = int(time.time())
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']
    for item in mtype:
        exec('this_partitions = hdfsutil.ls(config.hdfs_table_%s)' % item)
        str_parts_list = [i.split('=', 1)[1] for i in this_partitions]
        str_parts_list_int = map(int, str_parts_list)
        print "      ##  for table: %s" % item
        print "      ##  ",
        print str_parts_list_int

        for partition in str_parts_list_int:
            if partition < timenow - config.mrqos_table_delete:
                try:
                    print "      ##  handling table: %s with ts=%s" % (
                        item, str(partition))
                    # drop partitions (ok even if partition does not exist)
                    hiveql_str = 'use mrqos; alter table ' + item + ' drop if exists partition(ts=%s)' % str(
                        partition)
                    beeline.bln_e(hiveql_str)
                    # remove data from HDFS (ok even if folder in hdfs does not exist)
                    hdfs_d = os.path.join(config.hdfs_table, item,
                                          'ts=%s' % partition)
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed in hive table clean up in table: %s." % item
                    print e.message
def main():
    """Remove expired partitions of mrqos_region, maprule_info and mcm_machines.

    The time-out is ``now - config.mrqos_table_delete * 24 * 3``.  For
    ``mrqos.mrqos_region`` the top-level partitions whose value sorts before
    the time-out date (``YYYYMMDD``, GMT) are removed; for
    ``mrqos.maprule_info`` and ``mrqos.mcm_machines`` those whose numeric value
    is below the time-out timestamp.  For each table the HDFS folders are
    removed first and the Hive partitions are dropped only when that
    succeeded.  ``subprocess.CalledProcessError`` is logged, not raised.
    """
    # logging set-up
    logging.basicConfig(
        filename=os.path.join(config.mrqos_logging, 'hive_table_cleanup.log'),
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    # ##############################

    ts = int(time.time())
    ts_timeout = ts - config.mrqos_table_delete * 24 * 3  # 3 days = (24*3) hours of time-out
    date_timeout = time.strftime('%Y%m%d', time.gmtime(float(ts_timeout)))

    def top_level_partitions(table):
        # first path component of every partition line, de-duplicated and sorted
        lines = beeline.show_partitions(table).split('\n')
        return sorted(set(line.split('/')[0] for line in lines))

    def older_than_date(name):
        # YYYYMMDD strings of equal length order like the dates they encode
        return '=' in name and name.split('=')[1] < date_timeout

    def older_than_ts(name):
        if '=' not in name:
            return False
        try:
            return int(name.split('=')[1]) < ts_timeout
        except ValueError:
            # non-numeric partition value: never treat it as expired
            return False

    def purge(folder, table, expired, condition):
        # remove the HDFS folders, then drop the hive partitions
        try:
            logger.info('removing the data in HDFS')
            for item in expired:
                hdfsutil.rm(os.path.join(config.hdfs_table, folder, item),
                            r=True)
        except sp.CalledProcessError as e:
            logger.error('removed data from hdfs failed')
            # CalledProcessError carries no ``message``; str(e) has cmd + status
            logger.error('error: %s' % str(e))
            return

        try:
            logger.info('drop partitions, condition: %s' % condition)
            beeline.drop_partitions(tablename=table, condition=condition)
        except sp.CalledProcessError as e:
            logger.error('drop partition failed')
            logger.error('error: %s' % str(e))

    # ##############################
    # target table: mrqos_region (partitioned by datestamp)
    # ##############################

    list_to_clean = [x for x in top_level_partitions('mrqos.mrqos_region')
                     if older_than_date(x)]
    logger.info('handling table: mrqos_region')
    purge('mrqos_region', 'mrqos.mrqos_region', list_to_clean,
          'datestamp<%s' % str(date_timeout))

    # ##############################
    # target table: maprule_info, mcm_machines (partitioned by ts)
    # ##############################

    for scan in ['maprule_info', 'mcm_machines']:
        logger.info('handling table: %s' % scan)
        list_to_clean = [x for x in top_level_partitions('mrqos.%s' % scan)
                         if older_than_ts(x)]
        purge('%s' % scan, 'mrqos.%s' % scan, list_to_clean,
              'ts<%s' % str(ts_timeout))
Beispiel #14
0
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts - 3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    hour_list = [str("%02d" % x) for x in range(24)]
    hour_list = [x for x in hour_list if x <= hourstamp]
    region_summary_retrial_max = 10

    # check if the summary has been performed on this particular hour (last hour)
    folders_day = '/'.join(
        str(config.hdfs_qos_rg_view_hour % (datestamp, '00')).split('/')[0:-1])

    # check if the summary folder for "this day" (datestamp) has been created or not, if not, create one
    if hdfsutil.test_dic(folders_day):
        hdfsutil.mkdir(folders_day)

    folders_in = [folders_day + '/hour=%s' % x for x in hour_list]
    folders_out = hdfsutil.ls(folders_day)

    folders_missing = [x for x in folders_in if x not in folders_out]
    folders_missing.sort(reverse=True)

    for item in folders_missing:
        hourstamp = item[-2:]
        print "    ****  missing data for day = %s, hour = %s." % (datestamp,
                                                                   hourstamp),
        f = open(
            os.path.join(config.mrqos_hive_query,
                         'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp,
                             datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (
            datestamp, hourstamp)
        query_result_file = os.path.join(
            config.mrqos_query_result,
            'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print "    ****  perform beeline for hourly summary for day = %s, hour = %s." % (
            datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print "    ****  copy to local failed!"
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ****  summarization failed upto #retrials=" + str(
                    count_retrial)
                print "    ****  ",
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour %
                            (datestamp, hourstamp),
                            r=True)
                count_retrial += 1
def main():
    # parameters
    # RAinput='/home/testgrp/RAAnalysis/ra_data/ra_msg/assignments_agg'

    # current time
    timenow = int(time.time())
    # #### RA PART ####
    for ra_concat_file in glob.glob( os.path.join(config.RAconcat,'*.txt') ):
        infoitem = ra_concat_file.split('.')
        datestamp = infoitem[1]
        UUID = infoitem[2]
        STARTTIME = infoitem[3]
        ENDTIME = infoitem[4]
        print 'uuid=%s, starttime=%s, endtime=%s, datestamp=%s' % (UUID, STARTTIME, ENDTIME, datestamp)

        # upload ra_concat_file to HDFS
        print '*** uploading file to HDFS ' + ra_concat_file
        try:
            sp.check_call(['hadoop', 'fs', '-put', ra_concat_file, config.hdfs_ra_intermediate])
            sp.check_call(['rm', ra_concat_file])
            intermediate_file_name = ra_concat_file.split('/')[-1]
        except:
            print 'HDFS file upload error'
            # still remove the local file (keeps from cumulating the concatenated files)
            sp.check_call(['rm', ra_concat_file])
            continue # check the next ra_concat_file

        # create corresponding HDFS directory
        # PIG will create the HDFS in the designated folder

        # run PIG script to utilize AVRO
        # example: HADOOP_USER_NAME=akamai; pig11 -p datestamp=20151201 -p uuid=0e0bda82-9823-11e5-b44e-300ed5c5f881 -p ts=1448980818 /home/testgrp/RAAnalysis/pig/csv_to_avro.pig
        print '*** pig serializes the data into HDFS for file ' + ra_concat_file
        cmd = '%s; %s -p datestamp=%s -p uuid=%s -p ts=%s %s; %s' % ( config.cmd_hadoop_user_akamai,
                                                                      config.cmd_pig11,
                                                                      datestamp,
                                                                      UUID,
                                                                      STARTTIME,
                                                                      config.csv_to_avro_pig_script,
                                                                      config.cmd_hadoop_user_testgrp )
        #print cmd
        try:
            print 'try the pig script...'
            sp.check_call( cmd, shell=True )
            # pig log cleanup _log directory and _SUCCESS file when successful
            this_ra_temp_hdfs_location = config.hdfs_ra_temp % (datestamp,
                                                                UUID,
                                                                STARTTIME)
            this_ra_map_hdfs_location = config.hdfs_ra_map % (datestamp,
                                                              UUID,
                                                              STARTTIME)

            # copy the file from ramap [PIG OUTPUT] to RA_map folder [HIVE]
            print 'copy the file to RA_map folder'
            print 'HDFS copy RA-avro fail' if hdfs.cp( this_ra_temp_hdfs_location+'/part-r-00000.avro',
                                                       this_ra_map_hdfs_location) else 'HDFS copy RA-avro success'

            # remove the remainder in ramap [PIG output] folder (not fully clear yet)
            print 'remove the remainder in the ramap folder'
            cmd = '%s; hadoop fs -rm -r %s; %s' % (config.cmd_hadoop_user_akamai,
                                                   this_ra_temp_hdfs_location,
                                                   config.cmd_hadoop_user_testgrp)
            sp.check_call( cmd, shell=True )
            #cmd = '%s; hadoop fs -rm %s/_SUCCESS' % (config.cmd_hadoop_user_change,
            #                                         this_ra_map_hdfs_location)
            #sp.check_call( cmd, shell=True )

            # remove the remainder in the RA_pre_Avro folder
            print 'intermediate_file_name = ' + intermediate_file_name
            hdfs.rm( config.hdfs_ra_intermediate+'/'+intermediate_file_name )

            # update the HIVE table
            cmd = "hive -e 'use raana; MSCK REPAIR TABLE ra_map;'"
            sp.check_call( cmd, shell=True )

        except:
            print 'PIG script Error.'
Beispiel #16
0
def _export_hourly_table(logger, query, result_file):
    """Dump the result of `query` to `result_file`, retrying once on failure.

    A second failure is logged and swallowed: the export is best-effort and
    must not trigger a re-run of the (already successful) summary query.
    """
    try:
        beeline.bln_e_output(query, result_file)
        return
    except sp.CalledProcessError as err:
        logger.warning("copy to local failed, retrying...")
        # str(err) carries the command and exit status; err.message is empty
        # for CalledProcessError, so it is not worth printing.
        logger.warning("error message: %s", err)

    try:
        beeline.bln_e_output(query, result_file)
    except sp.CalledProcessError:
        logger.error("copy to local failed again, abort.")
        logger.exception("message")


def _run_hourly_hive_step(logger, label, hdfs_dir, hive_file, n_pairs,
                          datestamp, hourstamp, retrial_max, export_table=None):
    """Run one hourly hive procedure unless its output already exists.

    :param logger: logger receiving the progress / failure messages.
    :param label: human readable step name used in the log lines.
    :param hdfs_dir: HDFS partition folder of this step for the given hour.
    :param hive_file: query template file name under config.mrqos_hive_query.
    :param n_pairs: number of (datestamp, hourstamp) pairs the template expects.
    :param datestamp: day being summarized, formatted as %Y%m%d.
    :param hourstamp: hour being summarized, formatted as %H.
    :param retrial_max: maximum number of attempts for the hive query.
    :param export_table: when given, name of the mrqos table whose partition
        is exported to a local csv after a successful query.
    """
    # NOTE(review): a truthy hdfsutil.test_file() result is treated as
    # "file is missing" (presumably a non-zero shell exit status) -- confirm.
    if not hdfsutil.test_file(os.path.join(hdfs_dir, '000000_0.deflate')):
        logger.info("** %s hour: checking day = %s, hour = %s, and file exists." % (label,
                                                                                    datestamp,
                                                                                    hourstamp))
        return

    logger.info("** %s hour: checking day = %s, hour = %s, and file does not exist." % (label,
                                                                                        datestamp,
                                                                                        hourstamp))
    # the context manager closes the template even if the formatting fails
    with open(os.path.join(config.mrqos_hive_query, hive_file), 'r') as f:
        strcmd_s = f.read() % ((datestamp, hourstamp) * n_pairs)

    for count_retrial in range(retrial_max):
        tic = time.time()
        try:
            beeline.bln_e(strcmd_s)
        except sp.CalledProcessError:
            logger.info("BLN %s hour failed @ cost = %s sec in retrial #%s" % (label,
                                                                               str(time.time()-tic),
                                                                               str(count_retrial)))
            logger.exception("message")
            # delete the folder if summarization failed, so that the next
            # attempt (or the next cron run) starts from a clean partition.
            hdfsutil.rm(hdfs_dir, r=True)
            continue

        logger.info("BLN %s hour success @ cost = %s sec." % (label, str(time.time()-tic)))
        if export_table is not None:
            query = "select * from mrqos.%s where datestamp=%s and hour=%s;" % (export_table,
                                                                                datestamp,
                                                                                hourstamp)
            result_file = os.path.join(config.mrqos_query_result,
                                       '%s.%s.%s.csv' % (export_table, datestamp, hourstamp))
            _export_hourly_table(logger, query, result_file)
        break
    else:
        # every attempt failed: make the give-up visible in the log
        logger.error("BLN %s hour gave up after %s retrials for day = %s, hour = %s." % (label,
                                                                                         str(retrial_max),
                                                                                         datestamp,
                                                                                         hourstamp))


def main():
    """ Summarize the mrqos_region data for the previous hours.

    Walks back config.region_summary_back_filling hours from now and, for
    each hour whose output is missing on HDFS, runs the three hive procedures
    (region summary, case view, region view). The case view and region view
    results are additionally exported to local csv files.
    """
    ts = calendar.timegm(time.gmtime())
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'cron_region_summary_hour.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # start the logging
    logger.info("###################")
    logger.info("# Performing the hourly mrqos_region summary")
    logger.info("# starting time: " + str(ts) + " = " + time.strftime('GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts)))
    logger.info("###################")

    region_summary_retrial_max = 10

    # parameter: back-filling length, in hours (most recent hour first)
    bf_length = config.region_summary_back_filling
    ts_last_couple_hour_list = [ts-(1+x)*3600 for x in range(bf_length)]

    for ts_last_hour in ts_last_couple_hour_list:
        datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
        hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))

        # The SUMMARY HOUR hive procedure (no local export)
        _run_hourly_hive_step(logger, 'region summary',
                              config.hdfs_qos_rg_hour % (datestamp, hourstamp),
                              'mrqos_region_summarize_hour.hive', 3,
                              datestamp, hourstamp, region_summary_retrial_max)

        # The CASE VIEW hive procedure
        _run_hourly_hive_step(logger, 'case view',
                              config.hdfs_qos_case_view_hour % (datestamp, hourstamp),
                              'mrqos_case_view_hour.hive', 2,
                              datestamp, hourstamp, region_summary_retrial_max,
                              export_table='case_view_hour')

        # The REGION VIEW hive procedure
        _run_hourly_hive_step(logger, 'region view',
                              config.hdfs_qos_rg_view_hour % (datestamp, hourstamp),
                              'mrqos_region_view_hour.hive', 3,
                              datestamp, hourstamp, region_summary_retrial_max,
                              export_table='region_view_hour')
def main():
    """ Build the mrqos_ioratio partition matching the latest mrqos_region partition.

    Looks up the latest mrqos.mrqos_region partition and the newest
    mrqos.maprule_info partition that sorts before it, then runs the
    mrqos_ioratio hive query (up to 10 attempts) unless the joined output
    already exists on HDFS.
    """
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'io_ratio_join.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    ts = int(time.time())
    logger.info('########### ts=%s ###########' % str(ts))

    # IO-Ratio Join: the partition spec is expected to hold three key=value parts
    last_mrqos_region_partition = beeline.get_last_partitions('mrqos.mrqos_region')
    [datestamp, hourstamp, ts_region] = [x.split('=')[1] for x in last_mrqos_region_partition.split('/')]
    logger.info('MRQOS mrqos_region partition: datestamp=%s, hour=%s, ts_region=%s' % (datestamp,
                                                                                 hourstamp,
                                                                                 ts_region))

    # newest maprule_info partition strictly before the region partition.
    # NOTE(review): the comparison is lexicographic on 'ts=<value>' strings,
    # which assumes all timestamps have the same number of digits -- confirm.
    ts_bound = 'ts=%s' % ts_region
    mapruleinfo_partitions = [x for x in beeline.show_partitions('mrqos.maprule_info').split('\n')
                              if '=' in x and x < ts_bound]
    if not mapruleinfo_partitions:
        # nothing to join against: report it instead of dying on an IndexError
        logger.error('no maprule_info partition found before ts=%s, IO-ratio join skipped.' % ts_region)
        return
    ts_mapruleinfo = max(mapruleinfo_partitions).split('=')[1]
    logger.info('MRQOS maprule_info partition: ts_mapruleinfo=%s' % ts_mapruleinfo)

    region_summary_retrial_max = 10

    # ############################### #
    # The In-Out Ratio hive procedure #
    # ############################### #
    ioratio_dir = os.path.join(config.hdfs_table,
                               'mrqos_ioratio',
                               'datestamp=%s' % datestamp,
                               'hour=%s' % hourstamp,
                               'ts=%s' % ts_region)

    # NOTE(review): a truthy hdfsutil.test_file() result is treated as
    # "file is missing" (presumably a non-zero shell exit status) -- confirm.
    if not hdfsutil.test_file(os.path.join(ioratio_dir, '000000_0.deflate')):
        logger.info(' Joined file exists.')
        return

    logger.info(' Joined file not exist.')
    # the context manager closes the template even if the formatting fails
    with open(os.path.join(config.mrqos_hive_query, 'mrqos_ioratio.hive'), 'r') as f:
        strcmd_s = f.read() % (datestamp, hourstamp, ts_region,
                               datestamp, hourstamp, ts_region,
                               ts_mapruleinfo)
    print(strcmd_s)

    print(" BLN for hourly summary: day = %s, hour = %s. " % (datestamp, hourstamp))
    for count_retrial in range(region_summary_retrial_max):
        tic = time.time()
        try:
            beeline.bln_e(strcmd_s)
        except sp.CalledProcessError as err:
            logger.error('    ******  failed with time cost = %s upto # retrials=%s' % (str(time.time()-tic), str(count_retrial)))
            # str(err) carries the command and exit status; err.message is
            # empty for CalledProcessError.
            logger.error('error %s' % err)
            # delete the folder if summarization failed, so the next attempt
            # starts from a clean partition.
            hdfsutil.rm(ioratio_dir, r=True)
            continue
        logger.info('    ******  success with time cost = %s.' % str(time.time()-tic))
        break
    else:
        # every attempt failed: make the give-up visible in the log
        logger.error('    ******  gave up after # retrials=%s' % str(region_summary_retrial_max))
def _repair_rebuild_step(tour, hdfs_dir, hive_file, n_pairs, bln_banner,
                         fail_banner, datestamp, hourstamp, retrial_max):
    """Re-run one hive procedure of the repair for a single (day, hour).

    :param tour: step name shown in the progress header.
    :param hdfs_dir: HDFS partition folder of this step for the given hour.
    :param hive_file: query template file name under config.mrqos_hive_query.
    :param n_pairs: number of (datestamp, hourstamp) pairs the template expects.
    :param bln_banner: message template printed before the query is run.
    :param fail_banner: message template printed after a failed attempt.
    :param datestamp: day being repaired.
    :param hourstamp: hour being repaired.
    :param retrial_max: maximum number of attempts for the hive query.
    """
    print("    ****  %s tour:" % tour)
    # check if the summary has been performed on this particular hour.
    # The progress line is assembled piecewise; the trailing blanks keep the
    # same separators the former comma-terminated print statements produced.
    sys.stdout.write("    ****  checking day = %s, hour = %s. " % (datestamp, hourstamp))

    # NOTE(review): a truthy hdfsutil.test_file() result is treated as
    # "file is missing" (presumably a non-zero shell exit status) -- confirm.
    if not hdfsutil.test_file(os.path.join(hdfs_dir, '000000_0.deflate')):
        print(" file exists.")
        return

    sys.stdout.write(" file not exits, ")
    # the context manager closes the template even if the formatting fails
    with open(os.path.join(config.mrqos_hive_query, hive_file), 'r') as f:
        strcmd_s = f.read() % ((datestamp, hourstamp) * n_pairs)
    print(bln_banner % (datestamp, hourstamp))

    # repair does not care about moving the result to the local SQLite DB,
    # so only the hive query itself is (re)run here.
    for count_retrial in range(retrial_max):
        tic = time.time()
        try:
            beeline.bln_e(strcmd_s)
        except Exception:
            # Exception (not a bare except) so that Ctrl-C / sys.exit() abort
            # the repair instead of being counted as a failed attempt.
            print(fail_banner % (str(time.time()-tic), str(count_retrial)))
            # delete the folder if summarization failed.
            hdfsutil.rm(hdfs_dir, r=True)
            continue
        print("    ******  success with time cost = %s." % str(time.time()-tic))
        break
    else:
        # every attempt failed: make the give-up visible
        print("    ******  giving up after %s retrials." % str(retrial_max))


def main(argv):
    """ Clean (drop) and rebuild the mrqos_region related partitions of a day.

    Command line: region_summary_hour_repair.py -d <datestamp> -h <hour>

    :param argv: command line arguments without the program name.
        -d / --datestamp is mandatory; when -h / --hour is omitted all 24
        hours of the day are repaired; -q prints the usage and exits.

    Exits with status 2 on a malformed command line or a missing datestamp.
    """
    usage = 'region_summary_hour_repair.py -d <datestamp> -h <hour>'
    try:
        opts, _args = getopt.getopt(argv, "qd:h:", ["datestamp=", "hour="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    hour = ''
    datestamp = ''

    for opt, arg in opts:
        if opt == '-q':
            print(usage)
            sys.exit()
        elif opt in ("-d", "--datestamp"):
            datestamp = arg
        elif opt in ("-h", "--hour"):
            hour = arg

    ts = calendar.timegm(time.gmtime())
    print("###################")
    print("# Performing the repair of the  mrqos_region summary")
    print("# starting processing time is " + str(ts) + " = " + time.strftime('GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts)))
    print("###################")

    # The datestamp is mandatory: without it the cleanup below would be run
    # against HDFS / hive paths built from an empty day.
    if not datestamp:
        print(usage)
        sys.exit(2)

    print('Fixing datestamp = %s' % datestamp)

    if not hour:
        hour_list = [str("%02d" % x) for x in range(24)]
        print('Fixing hour = %s' % hour_list)
    else:
        hour_list = [hour]
        print('Fixing hour = %s' % hour)

    region_summary_retrial_max = 10

    # (tour, HDFS folder template, hive query file, # of (day, hour) pairs,
    #  banner printed before the query, banner printed on a failed attempt)
    steps = (
        ('summary hour', config.hdfs_qos_rg_hour,
         'mrqos_region_summarize_hour.hive', 3,
         " BLN for hourly summary: day = %s, hour = %s. ",
         "    ******  failed with time cost = %s upto # retrials=%s"),
        ('case view', config.hdfs_qos_case_view_hour,
         'mrqos_case_view_hour.hive', 2,
         " BLN for hourly summary for day = %s, hour = %s.",
         "    ******  failed with time cost = %s upto #retrials=%s"),
        ('region view', config.hdfs_qos_rg_view_hour,
         'mrqos_region_view_hour.hive', 3,
         " BLN for hourly summary for day = %s, hour = %s.",
         "    ******  failed with time cost = %s upto #retrials=%s"),
    )

    # all cleanups are done before any partition is rebuilt
    print("    #**** first perform table cleanups: ")
    for hourstamp in hour_list:
        cleanup_mrqos_region_related_tables(datestamp, hourstamp)

    print("    #**** rebuild the db / table: ")
    for hourstamp in hour_list:
        for tour, dir_template, hive_file, n_pairs, bln_banner, fail_banner in steps:
            _repair_rebuild_step(tour,
                                 dir_template % (datestamp, hourstamp),
                                 hive_file, n_pairs, bln_banner, fail_banner,
                                 datestamp, hourstamp,
                                 region_summary_retrial_max)
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts) + " = " + time.strftime(
        'GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts))
    print "###################"
    ts_last_hour = ts - 3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    #hour_list = [str("%02d" % x) for x in range(24)]
    region_summary_retrial_max = 10

    # ############################### #
    # The SUMMARY HOUR hive procedure #
    # ############################### #
    print "    ****  summary hour tour:"
    # check if the summary has been performed on this particular hour (last hour)
    print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(
            os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp),
                         '000000_0.deflate')):
        print " file not exits,",
        f = open(
            os.path.join(config.mrqos_hive_query,
                         'mrqos_region_summarize_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp,
                             datestamp, hourstamp)
        f.close()
        strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (
            datestamp, hourstamp)
        query_result_file = os.path.join(
            config.mrqos_query_result,
            'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))

        print " BLN for hourly summary: day = %s, hour = %s. " % (datestamp,
                                                                  hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            tic = time.time()
            try:
                beeline.bln_e(strcmd_s)
                print "    ******  success with time cost = %s." % str(
                    time.time() - tic)
                #try:
                #    beeline.bln_e_output(strcmd_g, query_result_file)
                #except:
                #    print "    ****  copy to local failed, retry!"
                #    beeline.bln_e_output(strcmd_g, query_result_file)
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ******  failed with time cost = %s upto # retrials=%s" % (
                    str(time.time() - tic), str(count_retrial))
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp),
                            r=True)
                count_retrial += 1
    else:
        print " file exists."

    # ############################ #
    # The CASE VIEW hive procedure #
    # ############################ #
    print "    ****  case view tour:"
    # check if the summary has been performed on this particular hour (last hour)
    print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(
            os.path.join(
                config.hdfs_qos_case_view_hour % (datestamp, hourstamp),
                '000000_0.deflate')):
        print " file not exits,",
        f = open(
            os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'),
            'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (
            datestamp, hourstamp)
        query_result_file = os.path.join(
            config.mrqos_query_result,
            'case_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary for day = %s, hour = %s." % (datestamp,
                                                                    hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                tic = time.time()
                beeline.bln_e(strcmd_s)
                print "    ******  success with time cost = %s." % str(
                    time.time() - tic)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except sp.CalledProcessError as e:
                    print "    ****  copy to local failed, retry!"
                    print e.message
                    beeline.bln_e_output(strcmd_g, query_result_file)
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ******  failed with time cost = %s upto #retrials=%s" % (
                    str(time.time() - tic), str(count_retrial))
                print e.message
                hdfsutil.rm(config.hdfs_qos_case_view_hour %
                            (datestamp, hourstamp),
                            r=True)
                count_retrial += 1

    else:
        print " file exists."

    # ############################## #
    # The REGION VIEW hive procedure #
    # ############################## #
    print "    ****  region view tour:"
    # check if the summary has been performed on this particular hour (last hour)
    print "    ****  checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(
            os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp),
                         '000000_0.deflate')):
        print " file not exits,",
        f = open(
            os.path.join(config.mrqos_hive_query,
                         'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp,
                             datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (
            datestamp, hourstamp)
        query_result_file = os.path.join(
            config.mrqos_query_result,
            'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary for day = %s, hour = %s." % (datestamp,
                                                                    hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                tic = time.time()
                beeline.bln_e(strcmd_s)
                print "    ******  success with time cost = %s." % str(
                    time.time() - tic)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except sp.CalledProcessError as e:
                    print "    ****  copy to local failed, retry!"
                    print e.message
                    beeline.bln_e_output(strcmd_g, query_result_file)
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ******  failed with time cost = %s upto #retrials=%s" % (
                    str(time.time() - tic), str(count_retrial))
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour %
                            (datestamp, hourstamp),
                            r=True)
                count_retrial += 1

    else:
        print " file exists."
def main():
    """Delete expired partitions of the mrqos Hive tables.

    Handles ``mrqos.mrqos_region`` (partitioned by ``datestamp``) and then
    ``mrqos.maprule_info`` / ``mrqos.mcm_machines`` (partitioned by ``ts``).
    For each table the partitions older than the time-out are listed with
    ``beeline.show_partitions``, their folders under ``config.hdfs_table``
    are removed with ``hdfsutil.rm`` and the partitions are then dropped from
    the Hive table.  ``sp.CalledProcessError`` failures of the removal or the
    drop are written to ``hive_table_cleanup.log`` instead of being raised.
    """
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'hive_table_cleanup.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    # ##############################

    ts = int(time.time())
    # NOTE(review): this is a 3-day time-out only if config.mrqos_table_delete
    # is the number of seconds in one hour -- confirm against config.
    ts_timeout = ts - config.mrqos_table_delete * 24 * 3 # 3 days = (24*3) hours of time-out

    date_timeout = time.strftime('%Y%m%d', time.gmtime(float(ts_timeout)))

    # keep only the first partition level and de-duplicate it
    list_to_clean = sorted(set(x.split('/')[0] for x in beeline.show_partitions('mrqos.mrqos_region').split('\n')))
    # both sides are YYYYMMDD strings, so the string comparison orders by date
    list_to_clean = [x for x in list_to_clean if ('=' in x and x.split('=')[1] < date_timeout)]

    logger.info('handling table: mrqos_region')
    try:
        logger.info('removing the data in HDFS')
        # remove the hdfs folder
        for item in list_to_clean:
            hdfsutil.rm(os.path.join(config.hdfs_table,
                                     'mrqos_region',
                                     '%s' % item),
                        r=True)

        # alter the hive table: mrqos_region
        try:
            logger.info('drop partitions, condition: datestamp<%s' % str(date_timeout))
            beeline.drop_partitions(tablename='mrqos.mrqos_region',
                                    condition='datestamp<%s' % str(date_timeout))
        except sp.CalledProcessError as e:
            logger.error('drop partition failed')
            # str(e) carries the command and exit status; e.message is empty
            # for CalledProcessError
            logger.error('error: %s' % str(e))

    except sp.CalledProcessError as e:
        logger.error('removed data from hdfs failed')
        logger.error('error: %s' % str(e))

    # ##############################
    # target table: maprule_info, mcm_machines
    # ##############################

    query_item = ['maprule_info', 'mcm_machines']

    for scan in query_item:
        logger.info('handling table: %s' % scan)
        list_to_clean = sorted(set(x.split('/')[0] for x in beeline.show_partitions('mrqos.%s' % scan).split('\n')))
        # these tables are partitioned by an integer timestamp
        list_to_clean = [x for x in list_to_clean if ('=' in x and int(x.split('=')[1]) < ts_timeout)]

        try:
            logger.info('removing the data in HDFS')
            # remove the hdfs folder
            for item in list_to_clean:
                hdfsutil.rm(os.path.join(config.hdfs_table,
                                         '%s' % scan,
                                         '%s' % item),
                            r=True)

            # alter the hive table currently being scanned
            try:
                logger.info('drop partitions, condition: ts<%s' % str(ts_timeout))
                beeline.drop_partitions(tablename='mrqos.%s' % scan,
                                        condition='ts<%s' % str(ts_timeout))
            except sp.CalledProcessError as e:
                logger.error('drop partition failed')
                logger.error('error: %s' % str(e))

        except sp.CalledProcessError as e:
            logger.error('removed data from hdfs failed')
            logger.error('error: %s' % str(e))