Example #1
def mrqos_table_cleanup():
    """ when called, this function will delete all partitions
        the clnspp table as long as it is older than the threshold """

    # list the existing partitions by checking the HDFS folders
    score_partitions = hdfsutil.ls(config.hdfs_table_score)
    str_parts_list = [i.split('=', 1)[1] for i in score_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold, if not, drop in hive table and remove from hdfs
    timenow = int(time.time())
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']
    for item in mtype:
        this_partitions = hdfsutil.ls(getattr(config, 'hdfs_table_%s' % item))
        str_parts_list = [i.split('=', 1)[1] for i in this_partitions]
        str_parts_list_int = map(int, str_parts_list)
        print "      ##  for table: %s" % item
        print "      ##  ",
        print str_parts_list_int

        for partition in str_parts_list_int:
            if partition < timenow - config.mrqos_table_delete:
                try:
                    print "      ##  handling table: %s with ts=%s" % (item, str(partition))
                    # drop partitions (ok even if partition does not exist)
                    hiveql_str = 'use mrqos; alter table ' + item + ' drop if exists partition(ts=%s)' % str(partition)
                    beeline.bln_e(hiveql_str)
                    # remove data from HDFS (ok even if folder in hdfs does not exist)
                    hdfs_d = os.path.join(config.hdfs_table, item, 'ts=%s' % partition)
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed in hive table clean up in table: %s." % item
                    print e.message
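
A note on the helpers: every example here leans on a project-local hdfsutil module whose implementation is not shown. The sketch below is only an assumption of what its ls/rm wrappers might look like if they shell out to the standard "hadoop fs" CLI, which would also explain why failures surface as sp.CalledProcessError.

import subprocess as sp

def ls(hdfs_dir):
    """ list the entries under an HDFS directory, e.g. ['.../score/ts=1461000000', ...] (sketch) """
    out = sp.check_output(['hadoop', 'fs', '-ls', hdfs_dir], universal_newlines=True)
    # skip the leading "Found N items" line; the path is the last field of each entry
    return [line.split()[-1] for line in out.splitlines()[1:] if line.strip()]

def rm(hdfs_dir, r=False):
    """ remove an HDFS path; r=True adds the recursive flag (sketch) """
    cmd = ['hadoop', 'fs', '-rm'] + (['-r'] if r else []) + [hdfs_dir]
    sp.check_call(cmd)
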
Example #2
def mrqos_join_cleanup():
    """ when called, this function will delete all partitions
        the clnspp table as long as it is older than the threshold """

    # list the existing partitions by checking the HDFS folders
    joined_partitions = hdfsutil.ls(config.hdfs_table_join)
    str_parts_list = [i.split('=', 1)[1] for i in joined_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold
    timenow = int(time.time())

    # get the list of retired data in HDFS using hive partitions
    try:
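        # note: the '<' below compares strings, which behaves like a numeric comparison here
        # because current epoch-second timestamps all have the same width (10 digits)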
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join').split('\n')\
                            if '=' in x and x.split('=')[1] < str(timenow-config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join', 'ts<%s' % str(timenow-config.mrqos_join_delete))
            print " drop partitions successful. "
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed to remove HDFS folder for mrqos_join at partition folder %s" % str(partition_id)
            print " remove HDFS successful. "
        except sp.CalledProcessError as e:
            print ">> failed to drop partitions"
    except sp.CalledProcessError as e:
        print ">> failed to obtain retire partition list (HIVE)"
        print e.message
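
The beeline helpers (bln_e, show_partitions, drop_partitions) are likewise project-local. A plausible sketch is given below, assuming they wrap the beeline CLI via its -u/-e flags; the JDBC URL is a placeholder, not the project's actual connection string. Hive's ALTER TABLE ... DROP PARTITION accepts comparison operators, which is what drop_partitions would rely on for a condition such as 'ts<...'.

import subprocess as sp

HIVE_JDBC_URL = 'jdbc:hive2://localhost:10000'  # placeholder / assumption

def bln_e(hiveql):
    """ run a HiveQL string through the beeline CLI (sketch) """
    sp.check_call(['beeline', '-u', HIVE_JDBC_URL, '-e', hiveql])

def show_partitions(table):
    """ return the raw output of 'show partitions <table>', one 'ts=...' entry per line (sketch) """
    return sp.check_output(['beeline', '-u', HIVE_JDBC_URL, '--outputformat=tsv2',
                            '-e', 'show partitions %s' % table], universal_newlines=True)

def drop_partitions(table, condition):
    """ drop every partition matching a comparison such as 'ts<1461000000' (sketch) """
    bln_e('alter table %s drop if exists partition (%s)' % (table, condition))
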
def mrqos_join_cleanupv2(logger):
    """ when called, this function will delete all partitions
        the clnspp table as long as it is older than the threshold """

    # list the existing partitions by checking the HDFS folders
    joined_partitions = hdfsutil.ls(config.hdfs_table_join2)
    str_parts_list = [i.split('=', 1)[1] for i in joined_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold
    timenow = int(time.time())

    # get the list of retired data in HDFS using hive partitions
    try:
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join2').split('\n')\
                            if '=' in x and x.split('=')[1] < str(timenow-config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join2', 'ts<%s' % str(timenow-config.mrqos_join_delete))
            logger.info("drop hive partitions successful. ")
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join2', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    logger.error('failed to remove HDFS folder for mrqos_join2 at partition folder %s' % str(partition_id))
            logger.info('remove HDFS successful. ')
        except sp.CalledProcessError as e:
            logger.error('failed to drop partitions. ')
    except sp.CalledProcessError as e:
        logger.error('failed to obtain retire partition list (HIVE)')
        logger.error('error message: %s' % e.message)
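
The v2 variant targets the mrqos_join2 table and reports through a caller-supplied logger instead of print, so an invocation might look like the following (the logger name here is arbitrary):

import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
mrqos_join_cleanupv2(logging.getLogger('mrqos.cleanup'))
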
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts-3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    hour_list = [str("%02d" % x) for x in range(24)]
    hour_list = [x for x in hour_list if x <= hourstamp]
    region_summary_retrial_max = 10

    # check if the summary has been performed on this particular hour (last hour)
    folders_day = '/'.join(str(config.hdfs_qos_rg_view_hour % (datestamp, '00')).split('/')[0:-1])

    # check if the summary folder for "this day" (datestamp) has been created or not, if not, create one
    if hdfsutil.test_dic(folders_day):
        hdfsutil.mkdir(folders_day)

    folders_in = [folders_day+'/hour=%s' % x for x in hour_list]
    folders_out = hdfsutil.ls(folders_day)

    folders_missing = [x for x in folders_in if x not in folders_out]
    folders_missing.sort(reverse=True)

    for item in folders_missing:
        hourstamp = item[-2:]
        print "    ****  missing data for day = %s, hour = %s." % (datestamp, hourstamp),
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result,'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print "    ****  perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print "    ****  copy to local failed!"
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ****  summarization failed upto #retrials="+str(count_retrial)
                print "    ****  ",
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
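
bln_e_output, used above to copy the hourly summary to a local CSV, is another project-local helper. A minimal sketch under the same beeline-CLI assumption as before (the JDBC URL is again a placeholder):

import subprocess as sp

HIVE_JDBC_URL = 'jdbc:hive2://localhost:10000'  # placeholder / assumption

def bln_e_output(hiveql, output_file):
    """ run a HiveQL query through beeline and write its result to a local file (sketch) """
    out = sp.check_output(['beeline', '-u', HIVE_JDBC_URL, '--outputformat=csv2',
                           '-e', hiveql], universal_newlines=True)
    with open(output_file, 'w') as f:
        f.write(out)
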
def main():
    # initialize the logger
    logging.basicConfig(
        filename=os.path.join('/home/testgrp/logs/', 'mapmon_summarize.log'),
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    timenow = int(time.time())
    datenow = str(datetime.date.today() - datetime.timedelta(1))
    date_idx = datenow[0:4] + datenow[5:7] + datenow[8:10]

    # get the latest barebone day_idx
    bb_day_idx = beeline.get_last_partitions('mapper.barebones').split('=')[1]
    logger.info("barebone index: day={}".format(bb_day_idx))

    # get the latest mpd yesterday
    uuid_list = [
        x.split('=')[-1] for x in hdfsutil.ls(
            os.path.join(os.path.dirname(config.hdfs_table), 'mapper',
                         'mapmon', 'day={}'.format(date_idx)))
    ]
    for uuid_idx in uuid_list:
        logger.info("dealing with day={}, uuid={}".format(date_idx, uuid_idx))
        file_location = os.path.join(config.hdfs_table, 'mapmon_sum',
                                     'day={}'.format(date_idx),
                                     'mpd_uuid={}'.format(uuid_idx))
        if hdfsutil.test_dic(file_location):
            logger.info('creating folder: {}'.format(file_location))
            hdfsutil.mkdir(file_location)

        if hdfsutil.test_file(os.path.join(file_location, '000000_0.deflate')):
            f = open(
                os.path.join(config.mrqos_hive_query, 'mapmon_summarize.hive'),
                'r')
            strcmd = f.read()
            strcmd_s = strcmd % (date_idx, uuid_idx, bb_day_idx, date_idx,
                                 uuid_idx, date_idx, uuid_idx)
            f.close()
            try:
                beeline.bln_e(strcmd_s)
            except:
                # delete the folder if summarization failed.
                logger.warn("summarization failed, removing hdfs folder.")
                hdfsutil.rm(file_location, r=True)
        else:
            logger.info(" file exists.")
def main():
    # initialize the logger
    logging.basicConfig(filename=os.path.join('/home/testgrp/logs/', 'mapmon_summarize.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    timenow = int(time.time())
    datenow = str(datetime.date.today()-datetime.timedelta(1))
    date_idx = datenow[0:4]+datenow[5:7]+datenow[8:10]

    # get the latest barebone day_idx
    bb_day_idx = beeline.get_last_partitions('mapper.barebones').split('=')[1]
    logger.info("barebone index: day={}".format(bb_day_idx))

    # get the latest mpd yesterday
    uuid_list = [x.split('=')[-1] for x in hdfsutil.ls(os.path.join(os.path.dirname(config.hdfs_table),'mapper','mapmon','day={}'.format(date_idx)))]
    for uuid_idx in uuid_list:
        logger.info("dealing with day={}, uuid={}".format(date_idx, uuid_idx))
        file_location = os.path.join(config.hdfs_table,
                                           'mapmon_sum',
                                           'day={}'.format(date_idx),
                                           'mpd_uuid={}'.format(uuid_idx))
        if hdfsutil.test_dic(file_location):
            logger.info('creating folder: {}'.format(file_location))
            hdfsutil.mkdir(file_location)


        if hdfsutil.test_file(os.path.join(file_location, '000000_0.deflate')):
            f = open(os.path.join(config.mrqos_hive_query, 'mapmon_summarize.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (date_idx, uuid_idx, bb_day_idx,
                                 date_idx, uuid_idx,
                                 date_idx, uuid_idx)
            f.close()
            try:
                beeline.bln_e(strcmd_s)
            except:
                # delete the folder if summarization failed.
                logger.warn("summarization failed, removing hdfs folder.")
                hdfsutil.rm(file_location, r=True)
        else:
            logger.info(" file exists.")
Example #7
def mrqos_table_cleanup():
    """ when called, this function will delete all partitions
        the clnspp table as long as it is older than the threshold """

    # list the existing partitions by checking the HDFS folders
    score_partitions = hdfsutil.ls(config.hdfs_table_score)
    str_parts_list = [i.split('=', 1)[1] for i in score_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold, if not, drop in hive table and remove from hdfs
    timenow = int(time.time())
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']
    for item in mtype:
        this_partitions = hdfsutil.ls(getattr(config, 'hdfs_table_%s' % item))
        str_parts_list = [i.split('=', 1)[1] for i in this_partitions]
        str_parts_list_int = map(int, str_parts_list)
        print "      ##  for table: %s" % item
        print "      ##  ",
        print str_parts_list_int

        for partition in str_parts_list_int:
            if partition < timenow - config.mrqos_table_delete:
                try:
                    print "      ##  handling table: %s with ts=%s" % (
                        item, str(partition))
                    # drop partitions (ok even if partition does not exist)
                    hiveql_str = 'use mrqos; alter table ' + item + ' drop if exists partition(ts=%s)' % str(
                        partition)
                    beeline.bln_e(hiveql_str)
                    # remove data from HDFS (ok even if folder in hdfs does not exist)
                    hdfs_d = os.path.join(config.hdfs_table, item,
                                          'ts=%s' % partition)
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed in hive table clean up in table: %s." % item
                    print e.message
Example #8
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day, insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts - 3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    hour_list = [str("%02d" % x) for x in range(24)]
    hour_list = [x for x in hour_list if x <= hourstamp]
    region_summary_retrial_max = 10

    # check if the summary has been performed on this particular hour (last hour)
    folders_day = '/'.join(
        str(config.hdfs_qos_rg_view_hour % (datestamp, '00')).split('/')[0:-1])

    # check if the summary folder for "this day" (datestamp) has been created or not, if not, create one
    if hdfsutil.test_dic(folders_day):
        hdfsutil.mkdir(folders_day)

    folders_in = [folders_day + '/hour=%s' % x for x in hour_list]
    folders_out = hdfsutil.ls(folders_day)

    folders_missing = [x for x in folders_in if x not in folders_out]
    folders_missing.sort(reverse=True)

    for item in folders_missing:
        hourstamp = item[-2:]
        print "    ****  missing data for day = %s, hour = %s." % (datestamp,
                                                                   hourstamp),
        f = open(
            os.path.join(config.mrqos_hive_query,
                         'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp,
                             datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (
            datestamp, hourstamp)
        query_result_file = os.path.join(
            config.mrqos_query_result,
            'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print "    ****  perform beeline for hourly summary for day = %s, hour = %s." % (
            datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print "    ****  copy to local failed!"
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print "    ****  summarization failed upto #retrials=" + str(
                    count_retrial)
                print "    ****  ",
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour %
                            (datestamp, hourstamp),
                            r=True)
                count_retrial += 1
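
These main() entry points are presumably run as standalone scripts (for example from cron), so each module would normally end with the usual guard:

if __name__ == '__main__':
    main()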