def host_usagetest_consumer():
    '''host_usagetest_consumer() -> takes usage test results from producers of
    such a metric (1-Nagios, 2-url of a UI) and populates the WMSMonitor database
    '''
    import os, commands, sys, fpformat
    sys.path.append('../common')
    import time
    import datetime
    import readconf_func
    import logging
    import socket
    import MySQLdb
    import urllib

    # logger was used below without ever being created; initialize it here
    logger = logging.getLogger('host_usagetest_consumer')

    confvar = readconf_func.readconf()

    #CONNECTING TO DB
    #Opening mysql db connection
    logger.info("Starting db connection")
    try:
        db = MySQLdb.connection(host=confvar.get('WMSMON_DB_HOST'),
                                user=confvar.get('WMSMON_DB_USER'),
                                passwd=confvar.get('WMSMON_DB_PWD'),
                                db=confvar.get('WMSMON_DB_NAME'))
    except Exception, e:
        stri2 = "ERROR CONNECTING TO WMSMonitor DB: " + str(e)
        logger.error(stri2)
        logger.error("ERROR: Please check mysql daemon is running and connection parameters are correct!")
        sys.exit(1)
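# A minimal sketch (NOT the actual continuation of this module) of how the
# consumer could pull one producer's usage-test results and store them, using
# the same urllib / low-level MySQLdb calls imported above.  The configuration
# key 'USAGETEST_URL', the tab-separated "timestamp<TAB>status<TAB>host" line
# format and the table name 'usage_test' are assumptions for illustration only.
def _store_usagetest_results(db, confvar, logger):
    import urllib
    try:
        page = urllib.urlopen(confvar.get('USAGETEST_URL')).readlines()
    except Exception, e:
        logger.error("Cannot download usage test results: " + str(e))
        return
    for line in page:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            continue
        timestamp, status, host = fields[0], fields[1], fields[2]
        # low-level MySQLdb connection objects are driven with query()
        db.query("INSERT INTO usage_test (timestamp, status, host) VALUES ('"
                 + timestamp + "','" + status + "','" + host + "')")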
def wms_balancing_arbiter():
    '''wms_balancing_arbiter() -> updates the wms instances available behind an
    alias depending on the load of the instances, according to the load metric
    provided by the wms_balancing_metric function.
    Return None if errors are raised during calculation.
    '''
    import os, commands, sys, fpformat
    sys.path.append('../common')
    import time
    import datetime
    import readconf_func
    import logging
    import socket
    import MySQLdb
    import logpredef

    logger = logging.getLogger('wms_balancing_arbiter')
    conf = readconf_func.readconf()

    #+++++++++++++++++++++++++++++
    #Opening mysql db connection
    try:
        db = MySQLdb.connection(host=conf.get('WMSMON_DB_HOST'),
                                user=conf.get('WMSMON_DB_USER'),
                                passwd=conf.get('WMSMON_DB_PWD'),
                                db=conf.get('WMSMON_DB_NAME'))
        logger.info("Starting db connection")
    except Exception, e:
        strxx = "ERROR CONNECTING TO WMSMonitor DB: " + str(e)
        logger.error(strxx)
        logger.error("ERROR: Please check mysql daemon is running and connection parameters are correct!")
        sys.exit(1)
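# A hedged sketch of the selection step the arbiter performs after connecting;
# it is NOT the actual continuation of the function above.  It assumes a
# hypothetical table 'wms_balancing' holding the latest loadb_fmetric per host
# (negative metric = failing or draining instance, see wms_balancing_metric
# below) and simply returns the N least-loaded hosts to keep behind the alias.
def _pick_instances_for_alias(db, n_instances):
    db.query("SELECT host, loadb_fmetric FROM wms_balancing")
    r = db.store_result()
    candidates = []
    for i in range(r.num_rows()):
        row = r.fetch_row()
        if row:
            host = row[0][0]
            metric = float(row[0][1])
            if metric > 0:          # drop failing / draining instances
                candidates.append((metric, host))
    candidates.sort()               # lower metric = lower load
    return [host for (metric, host) in candidates[:n_instances]]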
def run(self):
    #INITIALIZATION
    logger = logging.getLogger('data_collector')
    TIME_AT_START = time.time()
    logger.info('THIS IS WMSMonitor data_collector_daemon')
    logger.info('Reading wmsmon conf file')
    confvar = readconf_func.readconf()

    #CONNECTING TO DB
    #Opening mysql db connection
    logger.info("Starting db connection")
    try:
        db = MySQLdb.connection(host=confvar.get('WMSMON_DB_HOST'),
                                user=confvar.get('WMSMON_DB_USER'),
                                passwd=confvar.get('WMSMON_DB_PWD'),
                                db=confvar.get('WMSMON_DB_NAME'))
    except Exception, e:
        # renamed so the builtin str() is not shadowed
        errstr = "ERROR CONNECTING TO WMSMonitor DB: " + str(e)
        logger.error(errstr)
        logger.error("ERROR: Please check mysql daemon is running and connection parameters are correct!")
        sys.exit(1)
def run(self):
    #INITIALIZATION
    logger = logging.getLogger('cft')
    TIME_AT_START = time.time()
    confvar = readconf_func.readconf()
    logger.info('#########################################')
    logger.info('## This is the CNAF File Transfer tool ##')
    logger.info('#########################################')
    timenow_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(TIME_AT_START))
    logger.info('cft.py started on ' + timenow_str)
    logger.info('Configuration file read: ' + confvar['CONF_FILE1'])
    logger.info('Configuration file read: ' + confvar['CONF_FILE2'])

    #############################################################
    ### Look for already running wrapper process
    pname = 'cft.py'
    RUNNING = check_running_func.check_running(pname)
    if RUNNING:
        logger.error('Another cft.py is running. Aborting')
        sys.exit(1)

    ##############################################################
    #CHECKING MSG PATHS
    if os.access(confvar.get('INPUT_FILES_PATH'), os.F_OK) == False:
        logger.error('NOT EXISTING DIRECTORY: ' + confvar.get('INPUT_FILES_PATH') + '. Please check configuration file\n')
        sys.exit(1)
    if os.access(confvar.get('PROCESSED_FILES_PATH'), os.F_OK) == False:
        logger.error('NOT EXISTING DIRECTORY: ' + confvar.get('PROCESSED_FILES_PATH') + '. Please check configuration file\n')
        sys.exit(1)

    #Starting daemon
    while True:
        #Checking for new DATA Messages (renamed so the builtin list() is not shadowed)
        msg_list = os.listdir(confvar.get('INPUT_FILES_PATH'))
        if len(msg_list) == 0:
            logger.info("No new file to process")
        for msg in msg_list:
            if os.access(confvar.get('INPUT_FILES_PATH') + '/' + msg, os.F_OK) == True:
                #ACCESSING INPUT FILE
                logger.info('Working on file: ' + msg)
                msghdl = open(confvar.get('INPUT_FILES_PATH') + '/' + msg, 'r')
                lines = msghdl.readlines()
                for line in lines:
                    logger.info(line)
                    linesp = line.split(' ')
                    if len(linesp) < 2:
                        logger.error("File " + msg + " wrongly formatted. Please check manually. Exiting!")
                        sys.exit(1)
                    else:
                        file_in = linesp[0].strip().rstrip()
                        file_out = linesp[1].strip().rstrip()
                        logger.info("file_in = " + file_in)
                        logger.info("file_out = " + file_out)
                        globus_url_copy_func.globus_url_copy(confvar, file_in, file_out)
                msghdl.close()
                status = os.system('mv ' + confvar.get('INPUT_FILES_PATH') + '/' + msg + ' ' + confvar.get('PROCESSED_FILES_PATH'))
                if status != 0:
                    logger.error('Cannot move processed file to ' + confvar.get('PROCESSED_FILES_PATH') + '. Please check manually. Exiting!\n')
                    sys.exit(1)
        logger.info("Waiting for 5 seconds before checking for new files")
        time.sleep(5)
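# Each message file parsed by the loop above carries one transfer per line,
# "<source URL> <destination URL>", typically gsiftp:// endpoints, e.g.:
#
#   gsiftp://source.example.org/data/file1.dat gsiftp://dest.example.org/data/file1.dat
#
# The globus_url_copy_func module is not shown in this snippet; a minimal
# hedged sketch of its globus_url_copy() helper, assuming it simply wraps the
# globus-url-copy command line tool, could look like this:
def globus_url_copy(confvar, file_in, file_out):
    import os
    import logging
    logger = logging.getLogger('globus_url_copy')
    cmd = 'globus-url-copy ' + file_in + ' ' + file_out
    logger.info('Running: ' + cmd)
    status = os.system(cmd)
    if status != 0:
        logger.error('globus-url-copy failed with status ' + str(status))
    return status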
#!/usr/bin/env python
# J.M. Dana
# [email protected]

import sys, os
#sys.path.append('./stomp/')
import stomp
import time
import logpredef_wmslb
import logging
import readconf_func
from socket import gethostname

logger = logging.getLogger('send_data_to_activemq')
confvar = readconf_func.readconf()
sender_hostname = gethostname()


class MyListener(object):

    def on_connecting(self, host_and_port):
        logger.info('connecting...')
        #self.c.connect(wait=True)

    def on_disconnected(self):
        logger.info("lost connection")

    def on_message(self, headers, body):
        #self.__print_async("MESSAGE", headers, body)
        logger.info('MESSAGE')
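# A hedged usage sketch of how MyListener could be attached to a broker
# connection and a monitoring record published.  The exact stomp.py calls vary
# between library versions (older 3.x releases use conn.start() and
# send(message, destination=...), newer 4.x use send(destination, body)); the
# configuration keys 'ACTIVEMQ_BROKER', 'ACTIVEMQ_PORT' and 'ACTIVEMQ_TOPIC'
# are assumptions for illustration only.
def send_record(record):
    broker = confvar.get('ACTIVEMQ_BROKER')
    port = int(confvar.get('ACTIVEMQ_PORT'))
    conn = stomp.Connection([(broker, port)])
    conn.set_listener('', MyListener())
    conn.start()                      # not needed on recent stomp.py versions
    conn.connect()
    conn.send(record, destination=confvar.get('ACTIVEMQ_TOPIC'))
    time.sleep(1)                     # give the broker a moment before closing
    conn.disconnect()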
def wms_balancing_metric(WMS):
    '''wms_balancing_metric() -> returns a list containing:
    memusage, loadcpulimit, memlimit, disklimit, fdrain, fload, ftraversaltime
    The metric for wms load balancing is:
        < 0 if (service is failing) || (service is in drain)
        > 0 if (service is available)
    N.B. the higher the number, the higher the load on the wms.
    Return None if errors are raised during calculation.
    '''
    import os, commands, sys, fpformat
    sys.path.append('../../common')
    sys.path.append('../../common/classes')
    import time, urllib
    import datetime
    import readconf_func
    import f_metric_func
    import wms_class
    import socket

    #Initializing
    import logging
    logger = logging.getLogger('wms_balancing_metric')

    def mk_float_or_0(val):
        try:
            fval = float(val)
        except:
            return 0
        return fval

    #INITIALIZATION
    confvar = readconf_func.readconf()
    fdrain = 1
    fload = 1
    ftraversaltime = 1
    loadcpulimit = 15
    memlimit = 99
    memusage = 1
    disklimit = 90
    wmsdata = []

    ########### LOAD BALANCING PARAMETERS #####################
    LATENCY = 0  #confvar.get('LATENCY')
    LATENCY_PATH = ' '  #confvar.get('LATENCY_PATH')
    SUBMISSION_TEST = 0  # confvar.get('SUBMISSION_TEST')
    NAGIOS_PATH = ' '  #confvar.get('NAGIOS_PATH')
    LOAD_BALANCING_SITE_CONTACT = 'root@localhost'  #confvar.get('LOAD_BALANCING_SITE_CONTACT')
    ####################################################

    #########################################################
    #Calculating fdrain component
    #checks on daemons
    logger.info('checking daemons')
    for dae in WMS.daemons_dict.itervalues():
        if dae != '0' and dae != 'Null':
            logger.info('fdrain = -1 because of daemons:' + str(dae))
            fdrain = -1
            break

    env_script = confvar.get('ENV_FILE')

    #checking whether the wms has been manually put in drain
    cmddrain = '. ' + env_script + '; echo $GLITE_LOCATION_VAR'
    stddrain = os.popen(cmddrain)
    strtmp = stddrain.readlines()
    drainfile = strtmp[0].strip() + '/.drain'
    if os.access(drainfile, os.F_OK) == True:
        logger.info('fdrain = -1 because of drainfile presence')
        fdrain = -1

    #checking whether the wms is in autodrain for overload detection
    #cmd = "grep glite_wms_wmproxy_load_monitor /etc/glite-wms/glite_wms.conf |grep jobSubmit"
    cmdwmsconfig = '. ' + env_script + '; echo $GLITE_WMS_CONFIG_DIR'
    stddrain = os.popen(cmdwmsconfig)
    strtmp = stddrain.readlines()
    WMSFILE = strtmp[0].strip() + '/glite_wms.conf'
    cmd = "grep glite_wms_wmproxy_load_monitor " + WMSFILE + " |grep jobSubmit"
    std = os.popen(cmd)
    stdstr1 = std.readlines()
    # if everything is ok....
    if len(stdstr1) > 0:
        cmd = ". " + env_script + ";" + stdstr1[0][stdstr1[0].find("\"") + 1:stdstr1[0].find("\"", stdstr1[0].find("\"") + 1)]
        logger.info("invoking jobsubmit script: " + cmd)
        status = os.system(cmd + ' > /dev/null 2>&1')
        if status != 0:
            logger.info('fdrain = -1 because exit status is != 0 for command :')
            logger.info(cmd)
            fdrain = -1
        std = os.popen(cmd)
        stdstr = std.readlines()
        if len(stdstr) > 0:
            try:
                loadcpulimit = [x for x in stdstr if x.startswith('Threshold for Load Average(15 min)')][0].split()[5].strip()
                memlimit = [x for x in stdstr if x.startswith('Threshold for Memory Usage')][0].split()[4].strip()
                memusage = [x for x in stdstr if x.startswith('Threshold for Memory Usage')][0].split()[len(stdstr[3].split()) - 1].strip('%')
                disklimit = [x for x in stdstr if x.startswith('Threshold for Disk')][0].split()[4].strip('%')
            except:
                logger.error("Unable to parse /sbin/glite_wms_wmproxy_load_monitor script output ")
                return None
        else:
            logger.error("Problem reading glite_wms.conf file")
            return None
    #if status == 1:
    #    fdrain = -1;

    server_hostname = socket.getfqdn()

    #Site Nagios Submission Test
    nagiossubtest = 1
    if SUBMISSION_TEST == '1':
        nagiossubtest = 1
        cmd1 = 'wget -q ' + NAGIOS_PATH
        logger.info("BALANCING COMMAND: " + cmd1)
        if os.system(cmd1) == 0:
            #checking date
            cmdcheck = 'grep ' + server_hostname + ' ' + NAGIOS_PATH.split('/')[NAGIOS_PATH.count('/')]
            std = os.popen(cmdcheck)
            stdstr = std.readlines()
            if len(stdstr) > 0:
                timestr = stdstr[0].split('\t')[0]
                deltatime = int(time.time()) - int(time.mktime(time.strptime(timestr, "%Y-%m-%d %H:%M:%S")))
                # str() needed here: deltatime is an integer
                logger.info("NAGIOS DELTATIME: " + str(deltatime))
                print "NAGIOS DELTATIME: ", deltatime, '\n'
                if deltatime < 3600:
                    logger.info("NAGIOS CURRENT STATUS SUBTEST: " + stdstr[0].split('\t')[1])
                    if stdstr[0].split('\t')[1] == '2':
                        nagiossubtest = -1
                    elif stdstr[0].split('\t')[1] == '3':
                        SENDMAIL = "/usr/sbin/sendmail"  # sendmail location
                        p = os.popen("%s -t" % SENDMAIL, "w")
                        p.write("To: " + LOAD_BALANCING_SITE_CONTACT + "\n")
                        p.write("Subject: WARNING NAGIOS LOAD BALANCING SUBMISSION TEST FAILS\n")
                        p.write("\n")  # blank line separating headers from body
                        p.write("WARNING: SUBMISSION TEST TOO OLD!!!\n\n\n")
                        p.write("FILE : " + NAGIOS_PATH + "\n")
                        sts = p.close()
                        if sts != 0:
                            logger.info("Sendmail exit status" + str(sts))
                else:
                    SENDMAIL = "/usr/sbin/sendmail"  # sendmail location
                    p = os.popen("%s -t" % SENDMAIL, "w")
                    p.write("To: " + LOAD_BALANCING_SITE_CONTACT + "\n")
                    p.write("Subject: WARNING NAGIOS LOAD BALANCING SUBMISSION TEST FAILS\n")
                    p.write("\n")  # blank line separating headers from body
                    p.write("WARNING: SUBMISSION TEST TOO OLD!!!\n\n\n")
                    p.write("FILE : " + NAGIOS_PATH + "\n")
                    p.write("For wmsserver: " + server_hostname + "\n")
                    sts = p.close()
                    if sts != 0:
                        logger.info("Sendmail exit status" + str(sts))
            cmdrm = 'rm -f ' + NAGIOS_PATH.split('/')[NAGIOS_PATH.count('/')]
            status = os.system(cmdrm + ' 2&>1')
        else:
            SENDMAIL = "/usr/sbin/sendmail"  # sendmail location
            p = os.popen("%s -t" % SENDMAIL, "w")
            p.write("To: " + LOAD_BALANCING_SITE_CONTACT + "\n")
            p.write("Subject: WARNING NAGIOS LOAD BALANCING SUBMISSION TEST FAILS\n")
            p.write("\n")  # blank line separating headers from body
            p.write("WARNING: COULD NOT READ SUBMISSION TEST RESULTS!!!\n")
            p.write("PROBLEMS WHILE DOWNLOADING FILE : " + NAGIOS_PATH + "\n")
            p.write("For wmsserver: " + server_hostname + "\n")
            sts = p.close()
            if sts != 0:
                logger.info("Sendmail exit status" + str(sts))
        if fdrain > 0:
            fdrain = fdrain * nagiossubtest

    #CMS Latency Monitor Submission Test
    latencysubtest = 1
    if LATENCY == '1':
        cmd1 = 'wget -q ' + LATENCY_PATH + server_hostname + '.log'
        if os.system(cmd1) == 0:
            #checking date
            cmdcheck = 'tail -1 submit-tracks_' + server_hostname + '.log |awk \'{print $1}\''
            std = os.popen(cmdcheck)
            stdstr = std.readlines()
            if len(stdstr) > 0:
                timestr = stdstr[0].strip(':\n')
                deltatime = int(time.time()) - int(time.mktime(time.strptime(timestr, "%Y-%m-%d@%H.%M.%S")))
                if deltatime < 1800:
                    cmd2 = 'tail -1 submit-tracks_' + server_hostname + '.log|grep -c " done in"'
                    std = os.popen(cmd2)
                    stdstr = std.readlines()
                    if len(stdstr) > 0:
                        latencysubtest = int(stdstr[0].strip('\n'))
                        if latencysubtest == 0:
                            latencysubtest = -1
                else:
                    SENDMAIL = "/usr/sbin/sendmail"  # sendmail location
                    p = os.popen("%s -t" % SENDMAIL, "w")
                    p.write("To: " + LOAD_BALANCING_SITE_CONTACT + "\n")
                    p.write("Subject: WARNING LATENCY LOAD BALANCING SUBMISSION TEST FAILS\n")
                    p.write("\n")  # blank line separating headers from body
                    p.write("WARNING: SUBMISSION TEST TOO OLD!!!\n\n\n")
                    p.write("FILE : " + LATENCY_PATH + server_hostname + ".log\n")
                    sts = p.close()
                    if sts != 0:
                        logger.info("Sendmail exit status" + str(sts))
        else:
            SENDMAIL = "/usr/sbin/sendmail"  # sendmail location
            p = os.popen("%s -t" % SENDMAIL, "w")
            p.write("To: " + LOAD_BALANCING_SITE_CONTACT + "\n")
            p.write("Subject: WARNING LATENCY LOAD BALANCING SUBMISSION TEST FAILS\n")
            p.write("\n")  # blank line separating headers from body
            p.write("WARNING: COULD NOT READ SUBMISSION TEST RESULTS!!!\n")
            p.write("PROBLEMS WHILE DOWNLOADING FILE : " + LATENCY_PATH + server_hostname + ".log\n")
            sts = p.close()
            if sts != 0:
                logger.info("Sendmail exit status" + str(sts))
        if fdrain > 0:
            fdrain = fdrain * latencysubtest

    #Calculating load metric
    logger.info("Building load metric")
    # print disklimit
    fload = (f_metric_func.f_metric(mk_float_or_0(WMS['cpu_load']), loadcpulimit, 0)
             + f_metric_func.f_metric(memusage, memlimit, 1)
             + f_metric_func.f_metric(mk_float_or_0(WMS['disk_sandbox']), disklimit, 1))

    #Calculating traversaltime metric
    ftraversaltime = (min(f_metric_func.f_metric(mk_float_or_0(WMS['wm_queue']), 500, 0), 1)
                      + min(f_metric_func.f_metric(mk_float_or_0(WMS['jc_queue']), 500, 0), 1)
                      + min(f_metric_func.f_metric(mk_float_or_0(WMS['lb_event']), 3000, 0), 1))

    #summing metric components
    if fdrain > 0:
        load_balancing_metric = fload + ftraversaltime
    else:
        load_balancing_metric = fdrain

    #writing resulting metric to file
    filename = confvar.get('LOAD_BALANCING_FILENAME')
    try:
        logger.info('Trying to open file : ' + filename)
        f = open(filename, mode='a')
        logger.info("writing load balancing metric to file: fdrain=" + str(fdrain) + ", fload= " + str(fload) + ", ftraversaltime= " + str(ftraversaltime))
        f.write(str(load_balancing_metric) + '\n')
        f.close()
    except IOError:
        logger.error('CANNOT ACCESS FILE : ' + filename)

    #{'wm_queue':null_str,'jc_queue':null_str,'lb_event':null_str,'loadb_fdrain':null_str,'loadb_ftraversaltime':null_str,'loadb_fload':null_str,'loadb_fmetric':null_str,'condor_running':null_str,'condor_idle':null_str,'condor_current':null_str,'ism_size':null_str,'ism_entries':null_str,'gftp_con':null_str,'FD_WM':null_str,'FD_LM':null_str,'FD_JC':null_str,'FD_LL':null_str,'loadb_memusage':null_str,'ice_running':null_str,'ice_idle':null_str,'ice_pending':null_str,'ice_held':null_str,'ice_queue':null_str,'cpu_load':null_str}
    WMS['loadb_memusage'] = float(memusage) / float(memlimit)
    #WMS['loadcpulimit'] = loadcpulimit
    #WMS['memlimit'] = memlimit    #removed from class to be returned as %
    #WMS['disklimit'] = disklimit  # removed from class to be returned as %
    WMS['loadb_fdrain'] = fdrain
    WMS['loadb_fload'] = fload
    WMS['loadb_ftraversaltime'] = ftraversaltime
    WMS['loadb_fmetric'] = load_balancing_metric
    # logger.info('memusage: ' + str(WMS['memusage']))
    # logger.info('fdrain: '+ str(WMS['fdrain']))
    # logger.info('fload: '+ str(WMS['fload']))
    # logger.info('ftraversaltime: '+ str(WMS['ftraversaltime']))
    # logger.info('fmetric: '+ str(WMS['fmetric']))
    #f=open('/tmp/loadbalancingtest.txt', mode = 'a')
    #strtest= str(int(time.time())) + ' ' + str(load_balancing_metric) + ' ' + str(fdrain) + ' ' + str(fload) + ' ' + str(ftraversaltime) + ' ' + str(memusage) + ' ' + str(memlimit) + ' ' + WMS['load'] + ' ' + str(loadcpulimit) + ' ' + WMS['sandbox'] + ' ' + str(disklimit) + ' ' + WMS['input_fl'] + ' ' + '1000' + ' ' + WMS['queue_fl'] + ' ' + '1000' + ' ' + WMS['dg20'] + ' ' + '3000'
    #f.write(strtest + '\n')
    #f.close()  # commented out: the metric file was already closed above; this handle belongs to the debug block
    cmd3 = 'rm -f submit-tracks_' + server_hostname + '.log'
    os.system(cmd3)
    return 0
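# f_metric_func is not shown in this snippet.  From the calls above it takes
# (value, limit, flag) and returns a number that grows with the load and can
# exceed 1 (the traversal-time terms are capped with min(..., 1)).  A minimal
# hedged sketch consistent with that usage -- NOT the real implementation -- is
# a plain value/limit ratio; the third argument is accepted but its exact
# meaning is not documented here, so the sketch ignores it:
def f_metric(value, limit, flag):
    # ratio of current value to its limit; both operands may arrive as strings
    try:
        return float(value) / float(limit)
    except (TypeError, ValueError, ZeroDivisionError):
        return 0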
nday = int(sys.argv[1])
if nday > 0:
    print "\nUsage:\n"
    print "globus_error_detector.py NDAYAGO\n"
    print "ERROR: NDAYAGO must be <=0 "
    sys.exit(1)

SENSORFLAG = int(sys.argv[2])
if (SENSORFLAG != 0) and (SENSORFLAG != 1):
    print "\nUsage:\n"
    print "globus_error_detector.py NDAYAGO SENSORFLAG\n"
    print "ERROR: SENSORFLAG must be either 0 or 1 "
    sys.exit(1)

import readconf_func
confvar = readconf_func.readconf()

if SENSORFLAG:
    #JUST PRODUCING DATA FOR WMSMonitor SENSOR
    data = "\"" + time.strftime("%d %b", time.localtime(time.time() + nday * 86400)) + "\""  # 86400 seconds per day
    # datafile=time.strftime("%d%b",time.localtime(time.time()+nday*84600))
    datacalc = time.localtime(time.time() + nday * 86400)[1:3]
    print "Globus error detector starts on date: " + data + "\""

# confvar={'GLITE_LOG_DIR':'/var/log/glite','SITE_CONTACT': '[email protected],[email protected],[email protected]'}
logfile = ''
if (os.access(confvar.get('GLITE_LOG_DIR') + '/logmonitor_events.log', os.F_OK) == True) and \
   (os.access(confvar.get('GLITE_LOG_DIR') + '/logmonitor_events.log.1', os.F_OK) == False):
    logfile = confvar.get('GLITE_LOG_DIR') + '/logmonitor_events.log'
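# Usage sketch for the two positional arguments validated above: NDAYAGO
# (0 or negative, selecting how many days back to scan) and SENSORFLAG
# (1 to only produce data for the WMSMonitor sensor, 0 otherwise -- the
# non-sensor mode presumably notifies SITE_CONTACT).  For example:
#
#   python globus_error_detector.py -1 1    # scan yesterday's log, sensor mode
#   python globus_error_detector.py 0 0     # scan today's log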
def lb_query(lbhost,STARTDATE,ENDDATE,DBTYPE): #Initializing logger import logging logger = logging.getLogger('lb_query') confvar = readconf_func.readconf(); users_stats = [] # Establish a connection if DBTYPE == 'LBPROXY': lbhost = confvar['LBPROXY_DB_HOST'] dbuser = confvar['LBPROXY_DB_USER'] dbname = confvar['LBPROXY_DB_NAME'] elif DBTYPE == 'LBSERVER': lbhost = confvar['LB_DB_HOST'] dbuser = confvar['LB_DB_USER'] dbname = confvar['LB_DB_NAME'] logger.info('Establishing a connection with mysql DB') db = MySQLdb.connection(host = lbhost , user = dbuser , db = dbname, passwd = confvar['SERVER_MYSQL_PASSWORD'][1:-1]) ################ MAIN DATA CONTAINER LIST INITIALIZATION ###### wmsdata_list = [] ############################################################### def put_into_wmsdata(wmsdata_list,wmshostname,userdn,fieldlist,valuelist): wmsFOUND = False for wmsdata in wmsdata_list: if wmsdata.host == wmshostname: wmsFOUND = True try: wmsdata.add_user(userdn) except wmsdata_class.UserPresent: # logger.warning('User Already present in wmdata for host: ' + wmsdata.host) for field in fieldlist: wmsdata[userdn][field] = valuelist[fieldlist.index(field)] if not wmsFOUND: wmsdata = wmsdata_class.wmsdata(wmshostname) wmsdata.add_user(userdn) for field in fieldlist: wmsdata[userdn][field] = valuelist[fieldlist.index(field)] wmsdata_list.append(wmsdata) # Run a MySQL query to find the number of single jobs submitted in a given time interval PER USER and PER WMS logger.info('Running a MySQL query to find the number of single jobs submitted in a given time interval PER USER and PER WMS') querystr = "select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events,short_fields inner join users on events.userid=users.userid where events.event=short_fields.event and code='17' and time_stamp>'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and name='NSUBJOBS' and value='0' group by users.cert_subj,host;" logger.info('Query is : ' + querystr) db.query(querystr) r = db.store_result() # Iterate through the result set WMP_in = 0 if r: for i in range(1,r.num_rows() + 1): row = r.fetch_row() # logger.debug('FOUND ROW: ' + row ) if row: dn = row[0][0] rowhost = row[0][1] rowWMP_in = row[0][2] put_into_wmsdata(wmsdata_list,rowhost,dn,['WMP_in'],[rowWMP_in]) ###################################################################################################################### ### We decided to take anymore the avg and the std of nodes per collection because they are not summable on more lb ### WHat we do is to take PER USER the total number of jobs in collection, the min and max of nodes per collection ### This are summable and avg calculation can be done on collector side ### Anyway we sum over user on sensors side and we return alse the total number of jobs per collection, min and max of nodes PER WMS ### Summing over wmsdata data will be done at the end of this function ore on the wrapper if the wmsdata_list is returned ########################################################################################################################## # Run a query to find per user and per host the number of collection, the total number of nodes in collection the min and max of nodes per collection logger.info('Running a query to find per user and per host the number of collection, the total number of nodes in collection the min and max of nodes per collection') querystr = "select users.cert_subj, host, COUNT(value), sum(value), min(value),max(value) from events,short_fields 
inner join users on events.userid=users.userid where events.event=short_fields.event and code='17' and time_stamp>'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and name='NSUBJOBS' and short_fields.event='0' and value>'0' group by users.cert_subj,host" logger.info('Query is : ' + querystr) db.query(querystr) r = db.store_result() # Iterate through the result set if r: for i in range(1,r.num_rows() + 1): row = r.fetch_row() if row: dn = row[0][0] rowhost = row[0][1] rowWMP_in_col = row[0][2] rowWMP_in_col_nodes = row[0][3] rowWMP_in_col_min_nodes = row[0][4] rowWMP_in_col_max_nodes = row[0][5] put_into_wmsdata(wmsdata_list,rowhost,dn,['WMP_in_col','WMP_in_col_nodes','WMP_in_col_min_nodes','WMP_in_col_max_nodes'],[rowWMP_in_col,rowWMP_in_col_nodes,rowWMP_in_col_min_nodes,rowWMP_in_col_max_nodes]) # Run a query to find PER USER and PER WMS the number of jobs enqued to WM from WMP in a given time interval logger.info("Run a query to find PER USER and PER WMS the number of jobs enqued to WM from WMP in a given time interval") querystr = "select users.cert_subj, host, COUNT(events.jobid) from events,short_fields inner join users on events.userid=users.userid where events.event=short_fields.event and code='4' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and events.event=short_fields.event and prog='NetworkServer' and name='RESULT' and value='OK' group by users.cert_subj,host;" logger.info('Query is : ' + querystr) db.query(querystr) r = db.store_result() if r: for i in range(1,r.num_rows() + 1): row = r.fetch_row() if row: dn = row[0][0] rowhost = row[0][1] rowWM_in = row[0][2] put_into_wmsdata(wmsdata_list,rowhost,dn,['WM_in'],[rowWM_in]) # Run a MySQL query to find the number both collection and single jobs enqueued to WM in a given time interval from LogMonitor (i.e. Resubmitted) logger.info('Run a MySQL query to find the number both collection and single jobs enqueued to WM in a given time interval from LogMonitor (i.e. 
Resubmitted) PER USER and PER WMS') querystr="select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events,short_fields inner join users on events.userid=users.userid where code='4' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and events.event=short_fields.event and name='RESULT' and value='OK' and prog='LogMonitor' group by users.cert_subj, host;" logger.info('Query is : ' + querystr) db.query(querystr) r = db.store_result() # Iterate through the result set if r: for i in range(1,r.num_rows() + 1): row = r.fetch_row() if row: usernew = row[0][0] index = row[0][0].find('/CN=proxy/CN=proxy') if index != -1: usernew=row[0][0][0:index] dn = usernew rowhost = row[0][1] rowWM_in_res = row[0][2] put_into_wmsdata(wmsdata_list,rowhost,dn,['WM_in_res'],[rowWM_in_res]) # Run a MySQL query to find the number single jobs enqueued to Job Controller from WM in a given time interval PER WMS and PER USER logger.info('Run a MySQL query to find the number single jobs enqueued to Job Controller from WM in a given time interval per USER and PER WMS') querystr="select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events,short_fields inner join users on events.userid=users.userid where code='4' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and events.event=short_fields.event and name='RESULT' and value='OK' and prog='WorkloadManager' group by users.cert_subj,host;" logger.info('Query is : ' + querystr) db.query(querystr) r = db.store_result() # Iterate through the result set if r: for i in range(1,r.num_rows() + 1): row = r.fetch_row() if row: usernew = row[0][0] index = row[0][0].find('/CN=proxy/CN=proxy') if index != -1: usernew=row[0][0][0:index] dn = usernew rowhost = row[0][1] rowJC_in = row[0][2] put_into_wmsdata(wmsdata_list,rowhost,dn,['JC_in'],[rowJC_in]) # Run a MySQL query to find the number single jobs enqueued to Condor from Job Controller in a given time interval PER USER and PER WMS logger.info('Run a MySQL query to find the number single jobs enqueued to Condor from Job Controller in a given time interval PER USER and PER WMS') querystr="select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events,short_fields inner join users on events.userid=users.userid where code='1' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and events.event=short_fields.event and name='RESULT' and value='OK' and prog='JobController' group by users.cert_subj,host;" logger.info('Query is : ' + querystr) db.query(querystr) r = db.store_result() # Iterate through the result set if r: for i in range(1,r.num_rows() + 1): row = r.fetch_row() if row: usernew = row[0][0] index = row[0][0].find('/CN=proxy/CN=proxy') if index != -1: usernew=row[0][0][0:index] dn = usernew rowhost = row[0][1] rowJC_out = row[0][2] put_into_wmsdata(wmsdata_list,rowhost,dn,['JC_out'],[rowJC_out]) # Run a MySQL query to find the number of jobs done in a given time interval PER USER and PER WMS logger.info('Run a MySQL query to find the number single jobs done successfully in a given time interval PER USER and PER WMS') querystr="select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events,short_fields inner join users on events.userid=users.userid where events.jobid=short_fields.jobid and code='10' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and prog='LogMonitor' and name='REASON' and (value='Job terminated 
successfully' or value='Job Terminated Successfully') group by users.cert_subj,host;" logger.info('Query is : ' + querystr) db.query(querystr) r = db.store_result() # Iterate through the result set if r: for i in range(1,r.num_rows() + 1): row = r.fetch_row() if row: usernew = row[0][0] index = row[0][0].find('/CN=proxy/CN=proxy') if index != -1: usernew=row[0][0][0:index] dn = usernew rowhost = row[0][1] rowJOB_DONE = row[0][2] put_into_wmsdata(wmsdata_list,rowhost,dn,['JOB_DONE'],[rowJOB_DONE]) # Run a MySQL query to find the number of jobs aborted in a given time interval PER USER and PER WMS logger.info('Run a MySQL query to find the number single jobs aborted in a given time interval PER USER and PER WMS') querystr="select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events inner join users on events.userid=users.userid where code='12' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' group by users.cert_subj,host;" logger.info('Query is : ' + querystr) db.query(querystr) r = db.store_result() # Iterate through the result set if r: for i in range(1,r.num_rows() + 1): row = r.fetch_row() if row: usernew = row[0][0] index = row[0][0].find('/CN=proxy/CN=proxy') if index != -1: usernew=row[0][0][0:index] dn = usernew rowhost = row[0][1] rowJOB_ABORTED = row[0][2] put_into_wmsdata(wmsdata_list,rowhost,dn,['JOB_ABORTED'],[rowJOB_ABORTED]) # Run a MySQL query to find the DEST_CE of jobs in a given time interval PER WMS logger.info('Run a MySQL query to find DEST_CE of jobs in a given time interval PER WMS') ##### old ce query - this double counts ce for jobs landed onto cream ce #querystr="select value, host, COUNT(value) from (select DISTINCT(short_fields.event),events.jobid, short_fields.value, host from events,short_fields where events.jobid=short_fields.jobid and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and prog='WorkloadManager' and name='DEST_HOST' and value!='localhost' and value!='unavailable' and code='15') as temp group by value, host;" ################################################## ##### New query not to double counting ce for jobs landed onto cream ce querystr="select value,host, count(value) from (select distinct(short_fields.jobid), value, host from short_fields inner join events where events.code='15' and events.prog = 'WorkloadManager' and name='DEST_HOST' and time_stamp > '" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and value!='localhost' and value!='unavailable' and events.jobid=short_fields.jobid) as temp group by value, host;" ################################################## logger.info('Query is : ' + querystr) db.query(querystr) r = db.store_result() # Iterate through the result set if r: for i in range(1,r.num_rows() + 1): row = r.fetch_row() if row: rowCE = row[0][0] rowhost = row[0][1] rowCEcount = row[0][2] wmsFOUND = False for wmsdata in wmsdata_list: if wmsdata.host == rowhost: wmsFOUND = True try: wmsdata.add_ce(rowCE) wmsdata.add_ce_count(rowCE,rowCEcount) except wmsdata_class.CEPresent: # logger.warning('User Already present in wmdata for host: ' + wmsdata.host) wmsdata.add_CE_count(rowCEcount) if not wmsFOUND: wmsdata = wmsdata_class.wmsdata(rowhost) wmsdata.add_ce(rowCE) wmsdata.add_ce_count(rowCE,rowCEcount) wmsdata_list.append(wmsdata) # Run a MySQL query to find the LB used to store the jobs in a given time interval # Available only if DBTYPE = LBPROXY if DBTYPE == 'LBPROXY': logger.info('Run a MySQL query to find the LB used to store the jobs in a given time interval') 
querystr="select distinct dg_jobid from jobs inner join events on jobs.jobid=events.jobid where events.code = '17' and time_stamp > '" + STARTDATE + "' and time_stamp < '" + ENDDATE + "';" logger.info('Query is : ' + querystr) db.query(querystr) r = db.store_result() # Iterate through the result set if r: for i in range(1,r.num_rows() + 1): row = r.fetch_row() if row: rowLB = row[0][0] LBstr = LBstr = rowLB[rowLB.find('//') + 2 : rowLB.find(':9000') ] for wmsdata in wmsdata_list: wmsdata.add_lb(LBstr) db.close() # filename= confvar['INSTALL_PATH'] +'/sensors//tmp/USERSTATS_' + lbhost + '_' + wmshost + '.txt' # fileusersstats = open(filename,'w') # fileusersstats.write('START OF FILE\n') # for i in range(0,len(users_stats)): # fileusersstats.write(str(users_stats[i][0]) + '|' + str(users_stats[i][1]) + '|' + str(users_stats[i][2]) + '|' + str(users_stats[i][3]) + '|' + str(users_stats[i][4]) + '|' + str(users_stats[i][5]) + '|' + str(users_stats[i][6]) + '|' + str(users_stats[i][7]) + '|' + str(users_stats[i][8]) + '|\n') # fileusersstats.write('END OF FILE\n') # fileusersstats.close() return wmsdata_list
def wms_sensor(): '''wms_sensor() -> list of string in the followind order: running - running condor job as reported by condor_q idle - idle condor job as reported by condor_q current - current condor job as reported by condor_q load - machine load 15 as reported /proc/loadavg input_fl - unprocessed entries in input.fl queue_fl - unprocessed entries in queue.fl dg20 - number of dg20logd files in /var/log/glite ism_size - ism size in 1kB blocks ism_entries - CE ism entries sandbox - Sandbox partition occupancy (in %) tmp - tmp partition occupancy (in %) gftp - number of gftp process FD_WM - number of file descriptors opened by WM FD_LM - number of file descriptors opened by LM FD_JC - number of file descriptors opened by JC FD_LL - number of file descriptors opened by LL LB - status of LB daemon (0 is ok) LL - status of LL daemon (0 is ok) LBPX - status of LBPX daemon (0 is ok) PX - status of PX daemon (0 is ok) FTPD - status of FTPD daemon (0 is ok) JC - status of JC daemon (0 is ok) LM - status of LM daemon (0 is ok) WM - status of WM daemon (0 is ok) WMP - status of WMP daemon (0 is ok) ICE - status of ICE daemon (0 is ok) BDII - status of BDII daemon (0 is ok) NTPD - status of NTPD daemon (0 is ok) varlog - /var/log partition occupancy (in %) varlibmysql - /var/lib/mysql partition occupancy (in %) ''' import os, commands, sys, fpformat sys.path.append('../../common') sys.path.append('../../common/classes') #import MySQLdb import time import readconf_func import socket #Sensor functions import import condor_func import load_func import dg20_func import ism_stat_func import filelists_func import diskspace_checks_func import gftp_num_func import file_desc_func import daemons_status_func import wms_balancing_metric_func import ice_jobs_func import wms_class #Initializing logger import logging logger = logging.getLogger('wms_sensor') confvar = readconf_func.readconf(); # Starting Calling sensor functions.... #################################################################################### #### starting backgroung process, we will look at the end if they finished #################################################################################### ## NO MORE NEEDED #Launching in backgroud the creation of the mappping table #cmd = confvar['INSTALL_PATH'] + "/sensors/bin/wms_usermapping/wms_usermapping_func &" #os.system(cmd) ####################### #Launching the creation of the CE_MM file in background cmd = confvar['INSTALL_PATH'] + "/sensors/bin/CE_MM.sh " + confvar['WORKLOAD_MANAGER_LOG_FILE'] + " " + confvar['CE_MM_FILE'] + " &" os.system(cmd) ###################################################################################### ###################################################################################### # The condor_jobs first... logger.info('Calling condor_jobs function') #Return a list of total, idle, running, held jobs as reported by condor_q condor_list = condor_func.condor_jobs(confvar.get('ENV_FILE')) if condor_list[2] != None: running = condor_list[2] else: running = 'Null' if condor_list[0] != None: current = condor_list[0] else: current = 'Null' if condor_list[1] != None: idle = condor_list[1] else: idle = 'Null' # ...Then the ice_jobs ... 
logger.info('Calling ice_jobs function') #Return a list of total, idle, running, held jobs as reported by icedb tool ice_dict = ice_jobs_func.ice_jobs(confvar['ENV_FILE']) # ....Then the average cpu load in past 15 min logger.info('Calling load function') loadtmp=load_func.load_cpu() if loadtmp != None: load = loadtmp else: load = 'Null' # Number of jobs in Input.fl and Queue.fl and ice.fl logger.info('Calling filelists function') filelist_tmp = filelists_func.filelists(confvar.get('ENV_FILE')) #print "filelists_tmp = " + str(filelist_tmp) if filelist_tmp[0] != None: input_fl = filelist_tmp[0][0:len(filelist_tmp[0]) - 1] else: input_fl = 'Null' if filelist_tmp[1] != None: queue_fl = filelist_tmp[1][0:len(filelist_tmp[1]) - 1] else: queue_fl = 'Null' if filelist_tmp[2] != None: ice_fl = filelist_tmp[2][0:len(filelist_tmp[1]) - 1] else: ice_fl = 'Null' # ....Then the number of dg20 files in the wms logger.info('Calling dg20log function') dg20 = dg20_func.dg20log( confvar.get('DG20_PATH')) if dg20 == None: dg20 = 'Null' #...Then the ism status logger.info('Calling ism_stat function') ism_tmp = ism_stat_func.ism_stat(confvar.get('ISMDUMP_PATH'),confvar.get('GLITE_LOG_DIR')) if ism_tmp[0] == None: ism_tmp[0] = 'Null' if ism_tmp[1] == None: ism_tmp[1] = 'Null' ism_size = ism_tmp[0] ism_entries = ism_tmp[1] # % of disk occupacy hosting Sandbox and tmp directories logger.info("Calling diskspace_checks function") output_tmp=diskspace_checks_func.diskspace_checks(confvar.get('SANDBOX_PATH'),confvar.get('TMP_PATH'),confvar.get('VAR_LOG_PATH'),confvar.get('VAR_LIB_MYSQL_PATH')) if output_tmp[0] != None: sandbox = output_tmp[0][0:len(output_tmp[0])] else: sandbox = 'Null' if output_tmp[1] != None: tmp = output_tmp[1][0:len(output_tmp[1])] else: tmp = 'Null' if output_tmp[2] != None: varlog = output_tmp[2][0:len(output_tmp[2])] else: varlog = 'Null' if output_tmp[3] != None: varlibmysql = output_tmp[3][0:len(output_tmp[3])] else: varlibmysql = 'Null' # ....Then the number of gridftp sessions in the wms logger.info("Calling gftp_num function") if gftp_num_func.gftp_num() != None: gftp = gftp_num_func.gftp_num() else: gftp = 'Null' #.... Then file descriptors for WM,LM,JC,LL logger.info("Calling file descriptor function") output_tmp=file_desc_func.file_desc(confvar.get('FD_WMS_WM'),confvar.get('FD_WMS_LM'),confvar.get('FD_WMS_JC'),confvar.get('FD_WMS_LBINTERLOG')) if output_tmp[0] != None: FD_WM = output_tmp[0] else: FD_WM = 'Null' if output_tmp[1] != None: FD_LM = output_tmp[1] else: FD_LM = 'Null' if output_tmp[2] != None: FD_JC = output_tmp[2] else: FD_JC = 'Null' if output_tmp[3] != None: FD_LL = output_tmp[3] else: FD_LL = 'Null' #.... 
Then checking wms daemons status for 'glite-lb-bkserverd','glite-lb-locallogger','glite-lb-proxy', # 'glite-proxy-renewald','glite-wms-ftpd','glite-wms-jc', # 'glite-wms-lm','glite-wms-wm','glite-wms-wmproxy'''' logger.info("Calling daemons status check function") output_tmp=daemons_status_func.daemons_status(confvar.get('GLITE_DAEMONS_PATH')) # print 'daemons:', output_tmp, '\n' # print confvar.get('GLITE_DAEMONS_PATH') if output_tmp[0]!=None: LL = output_tmp[0] else: LL = 'Null' if output_tmp[1]!=None: LBPX = output_tmp[1] else: LBPX = 'Null' if output_tmp[2]!=None: PX = output_tmp[2] else: PX = 'Null' if output_tmp[3]!=None: FTPD = output_tmp[3] else: FTPD = 'Null' if output_tmp[4]!=None: JC = output_tmp[4] else: JC = 'Null' if output_tmp[5]!=None: LM = output_tmp[5] else: LM = 'Null' if output_tmp[6]!=None: WM = output_tmp[6] else: WM = 'Null' if output_tmp[7]!=None: WMP = output_tmp[7] else: WMP = 'Null' if output_tmp[8] != None: ICE = output_tmp[8] else: ICE = 'Null' if output_tmp[9] != None: BDII = output_tmp[9] else: BDII = 'Null' if output_tmp[10] != None: NTPD = output_tmp[10] else: NTPD = 'Null' #Logging fields # Now we create the WMS object hostname = socket.getfqdn() WMS = wms_class.WMS(hostname) WMS['condor_running'] = str(running) WMS['condor_idle'] = str(idle) WMS['condor_current'] = str(current) WMS['ice_idle'] = str(ice_dict['IDLE']) WMS['ice_pending'] = str(ice_dict['PENDING']) WMS['ice_running'] = str(ice_dict['RUNNING']) WMS['ice_held'] = str(ice_dict['HELD']) WMS['cpu_load'] = str(load) WMS['wm_queue'] = str(input_fl) WMS['jc_queue'] = str(queue_fl) WMS['ice_queue'] = str(ice_fl) WMS['ism_size'] = str(ism_size) WMS['ism_entries'] = str(ism_entries) WMS['disk_sandbox'] = str(sandbox) WMS['disk_tmp'] = str(tmp) WMS['disk_varlog'] = str(varlog) WMS['disk_varlibmysql'] = str(varlibmysql) WMS['gftp_con'] = str(gftp) WMS['lb_event'] = dg20 WMS['FD_WM'] = str(FD_WM) WMS['FD_LM'] = str(FD_LM) WMS['FD_JC'] = str(FD_JC) WMS['FD_LL'] = str(FD_LL) #WMS['LB'] = str(LB) # removed in 3.0 #WMS.daemons_dict['LB'] = WMS['LB'] # removed in 3.0 WMS['daemon_LL'] = str(LL) WMS.daemons_dict['daemon_LL'] = WMS['daemon_LL'] WMS['daemon_LBPX'] = str(LBPX) WMS.daemons_dict['daemon_LBPX'] = WMS['daemon_LBPX'] WMS['daemon_PX'] = str(PX) WMS.daemons_dict['daemon_PX'] = WMS['daemon_PX'] WMS['daemon_FTPD'] = str(FTPD) WMS.daemons_dict['daemon_FTPD'] = WMS['daemon_FTPD'] WMS['daemon_JC'] = str(JC) WMS.daemons_dict['daemon_JC'] = WMS['daemon_JC'] WMS['daemon_LM'] = str(LM) WMS.daemons_dict['daemon_LM'] = WMS['daemon_LM'] WMS['daemon_WM'] = str(WM) WMS.daemons_dict['daemon_WM'] = WMS['daemon_WM'] WMS['daemon_WMP'] = str(WMP) WMS.daemons_dict['daemon_WMP'] = WMS['daemon_WMP'] WMS['daemon_ICE'] = str(ICE) WMS.daemons_dict['daemon_ICE'] = WMS['daemon_ICE'] WMS['daemon_BDII'] = str(BDII) WMS.daemons_dict['daemon_BDII'] = WMS['daemon_BDII'] WMS['daemon_NTPD'] = str(NTPD) WMS.daemons_dict['daemon_NTPD'] = WMS['daemon_NTPD'] logger.info('Calling wms_balancing_metric_func') metric_output = wms_balancing_metric_func.wms_balancing_metric(WMS) logger.debug("WMS values collected are:") logger.debug(str(WMS)) # Before reuturning check if the wms_usermapping and CE_MM have finished # If not wait for a maximun of 30 seconds #logger.info("Waiting for usermap to complete its job") file_tmp = confvar['MAPTABLE_FILENAME'] file_tmp2 = confvar['CE_MM_FILE'] MAP_DONE = False MM_DONE = False LOOP_TIMEOUT = int( confvar['LOOP_TIMEOUT'] ) START_LOOP_TIME = time.time() EXIT_THE_LOOP = False while EXIT_THE_LOOP == False and 
(time.time() - START_LOOP_TIME) < LOOP_TIMEOUT: # if (os.access(file_tmp,os.F_OK) == True) and (os.WEXITSTATUS(os.system(("/usr/sbin/lsof " + file_tmp + " >/dev/null 2>&1"))) == 1): # if (os.access(file_tmp,os.F_OK) == True) and (os.WEXITSTATUS(os.system(("/usr/sbin/lsof " + file_tmp ))) == 1) : #yes, good, the files are accessible # MAP_DONE = True # logger.info("Usermap completed. Returning") # if (os.access(file_tmp2,os.F_OK) == True) and (os.WEXITSTATUS(os.system(("/usr/sbin/lsof " + file_tmp2 ))) == 1) : if (os.access(file_tmp2,os.F_OK) == True) and (os.WEXITSTATUS(os.system(("/usr/sbin/lsof " + file_tmp2 + " >/dev/null 2>&1"))) == 1): #yes, good, the files are accessible MM_DONE = True logger.info("CE_MM completed. Returning") EXIT_THE_LOOP = MM_DONE # if MAP_DONE == False: # logger.warning("Usermap did not complete its job.") #change this log message if MM_DONE == False: logger.warning("CE_MM did not complete its job.") #change this log message # now we should return a wms object and the presence of the mapping file return WMS , MM_DONE
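# wms_class is imported by wms_sensor() above but not shown in this snippet.
# The sensor only relies on the WMS object behaving like a dictionary keyed by
# metric name and carrying a daemons_dict sub-dictionary.  A minimal hedged
# sketch compatible with that usage (the real class may hold more logic):
class WMS(object):

    def __init__(self, hostname):
        self.host = hostname
        self.daemons_dict = {}
        self._values = {}

    def __setitem__(self, key, value):
        self._values[key] = value

    def __getitem__(self, key):
        return self._values[key]

    def __str__(self):
        return self.host + ': ' + str(self._values)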
def lb_query(rowhost, STARTDATE, ENDDATE):
    # Initializing logger
    import logging
    logger = logging.getLogger('lb_apiquery')

    ################ INITIALIZATION #######################
    confvar = readconf_func.readconf()
    API_CMD_PATH = './'
    wmsdata_list = []
    users_stats = []

    ################ FUNCTION DEFINITION ##################
    def put_into_wmsdata(wmsdata_list, wmshostname, userdn, fieldlist, valuelist):
        # Store the given field/value pairs for userdn in the wmsdata object of
        # wmshostname, creating the wmsdata entry if it does not exist yet.
        wmsFOUND = False
        for wmsdata in wmsdata_list:
            if wmsdata.host == wmshostname:
                wmsFOUND = True
                try:
                    wmsdata.add_user(userdn)
                except wmsdata_class.UserPresent:
                    pass
                for field in fieldlist:
                    wmsdata[userdn][field] = valuelist[fieldlist.index(field)]
        if not wmsFOUND:
            wmsdata = wmsdata_class.wmsdata(wmshostname)
            wmsdata.add_user(userdn)
            for field in fieldlist:
                wmsdata[userdn][field] = valuelist[fieldlist.index(field)]
            wmsdata_list.append(wmsdata)

    def group_by_key(api_output_list, keyposition, CNPROXYFLAG):
        # INPUTS:
        #  - api_output_list: the output of the api query command execution
        #  - keyposition: the position of the grouping key (e.g. user DN or CE queue)
        #    in the output lines split by the tab separator
        #  - CNPROXYFLAG: set to TRUE/1 to group DNs which differ only by a
        #    "/CN=proxy/CN=proxy" suffix
        # OUTPUT:
        #  - dictionary of key and count of occurrences
        dictionary = {}
        l_key = []
        for l in api_output_list:
            l_key.append(l.split('\t')[keyposition])
        for key in set(l_key):
            dictionary[key] = l_key.count(key)
        if CNPROXYFLAG:
            # grouping users and proxies
            for key in dictionary.keys():
                index = key.find('/CN=proxy/CN=proxy')
                if index != -1:
                    dn = key[0:index]
                    if dictionary.has_key(dn):
                        dictionary[dn] = dictionary[dn] + dictionary.pop(key)
                    else:
                        dictionary[dn] = dictionary.pop(key)
        return dictionary

    def resolve_jobuser(jobid):
        # INPUT:
        #  - jobid for which we want to derive the user
        # OUTPUT:
        #  - job USER DN ('Null' if it cannot be resolved)
        # N.B. it exploits the lbproxy socket if available
        import os.path
        if os.path.exists('/tmp/lb_proxy_serve.sock'):
            stream = os.popen("./job_status -x /tmp/lb_proxy_serve.sock " + jobid + " |grep owner")
            output = stream.readlines()
            if output:
                return output[0].split(':')[1]
            else:
                return 'Null'

    def checkoutput_to_resolve_jobuser(apiqueryoutput):
        # INPUT:
        #  - output lines from the api query
        # OUTPUT:
        #  - the same lines with the job USER DN substituted where the owner field
        #    contains the '(null)' string
        # N.B. it exploits the lbproxy socket if available
        import os.path
        if os.path.exists('/tmp/lb_proxy_serve.sock'):
            logger.debug('entering checkoutput_to_resolve_jobuser function')
            out = apiqueryoutput
            for iji in range(0, len(out)):
                if out[iji].split('\t')[0].find('(null)') != -1:
                    logger.debug('found (null) DN, for jobid:' + out[iji].split('\t')[1])
                    user = resolve_jobuser(out[iji].split('\t')[1])
                    logger.debug('substituted with:' + user)
                    user = user.strip().strip('\n').lstrip().strip()
                    tmp = out[iji].replace('(null)', user, 1)
                    logger.debug('new line tmp ' + tmp)
                    out[iji] = tmp
                    logger.debug('new line apioutput ' + out[iji])
            return out
        else:
            logger.warning('NO lb-proxy-socket-file found, unable to determine some jobs OWNER field')
            return apiqueryoutput

    ############################################
    ########## STARTING QUERIES ################
    # Run a query to find the number of jobs and collections submitted in a given time interval PER USER
    logger.info('Running a query to find the number of jobs submitted in a given time interval PER USER')
    stream = os.popen(API_CMD_PATH + "/submitted_jobs " + STARTDATE + " " + ENDDATE)
    output = stream.readlines()
    if output:
        # checking jobs with null owner
        output = checkoutput_to_resolve_jobuser(output)
        l_single = []
        l_collection_user = []
        l_collection_values = []
        # SEPARATING SINGLE JOBS FROM COLLECTIONS
        for l1 in output:
            if l1.split('\t')[2] == '0':
                l_single.append(l1)
            else:
                l_collection_user.append(l1.split('\t')[0])
                l_collection_values.append(l1.split('\t')[2])
        dict_tmp = group_by_key(l_single, 0, 1)
        # STORING SINGLE JOBS DATA
        for dn in dict_tmp.keys():
            put_into_wmsdata(wmsdata_list, rowhost, dn, ['WMP_in'], [dict_tmp[dn]])
        # EXTRACTING COLLECTIONS DATA, GROUPING SAME USER DN & PROXY
        dict_tmp = {}
        for user in set(l_collection_user):
            values = []
            for count in range(0, len(l_collection_user)):
                if l_collection_user[count] == user:
                    values.append(int(l_collection_values[count]))
            # GROUPING DN AND PROXY OF SAME USER
            index = user.find('/CN=proxy/CN=proxy')
            if index != -1:
                # CASE with PROXY
                dn = user[0:index]
                if dict_tmp.has_key(dn):
                    dict_tmp[dn][0] = dict_tmp[dn][0] + len(values)
                    dict_tmp[dn][1] = dict_tmp[dn][1] + sum(values)
                    dict_tmp[dn][2] = min(dict_tmp[dn][2], min(values))
                    dict_tmp[dn][3] = max(dict_tmp[dn][3], max(values))
                else:
                    dict_tmp[dn] = [len(values), sum(values), min(values), max(values)]
            else:
                # CASE without PROXY: checking whether the same user was already inserted as proxy
                if dict_tmp.has_key(user):
                    dict_tmp[user][0] = dict_tmp[user][0] + len(values)
                    dict_tmp[user][1] = dict_tmp[user][1] + sum(values)
                    dict_tmp[user][2] = min(dict_tmp[user][2], min(values))
                    dict_tmp[user][3] = max(dict_tmp[user][3], max(values))
                else:
                    dict_tmp[user] = [len(values), sum(values), min(values), max(values)]
        # STORING COLLECTIONS DATA
        for dn in dict_tmp.keys():
            put_into_wmsdata(wmsdata_list, rowhost, dn,
                             ['WMP_in_col', 'WMP_in_col_nodes', 'WMP_in_col_min_nodes', 'WMP_in_col_max_nodes'],
                             dict_tmp[dn])

    # EXPLOITING REGISTER EVENT JOBS TO EXTRACT THE SET OF LB SERVERS USED BY THE CONSIDERED WMS HOST
    # Notice that in the LBPROXY case JUST 1 WMSHOST is in wmsdata_list. We keep the list as legacy...
    dict_tmp = {}
    l_key = []
    for wmsdata in wmsdata_list:
        for l in output:
            wmsdata.add_lb(l.split('\t')[1].split('/')[2].strip(':9000'))

    # Run a query to find PER USER and PER WMS the number of jobs enqueued to WM from WMP in a given time interval
    logger.info("Run a query to find PER USER and PER WMS the number of jobs enqueued to WM from WMP in a given time interval")
    stream = os.popen(API_CMD_PATH + "/enqueued_WM_jobs " + STARTDATE + " " + ENDDATE)
    output = stream.readlines()
    if output:
        # checking jobs with null owner
        output = checkoutput_to_resolve_jobuser(output)
        dict_tmp = group_by_key(output, 0, 1)
        for dn in dict_tmp.keys():
            put_into_wmsdata(wmsdata_list, rowhost, dn, ['WM_in'], [dict_tmp[dn]])

    # Run a query to find the number of both collection and single jobs enqueued to WM in a given time interval from LogMonitor (i.e. Resubmitted) PER USER and PER WMS
    logger.info('Run a query to find the number of both collection and single jobs enqueued to WM in a given time interval from LogMonitor (i.e. Resubmitted) PER USER and PER WMS')
    stream = os.popen(API_CMD_PATH + "/resubmitted_WM_jobs " + STARTDATE + " " + ENDDATE)
    output = stream.readlines()
    if output:
        # checking jobs with null owner
        output = checkoutput_to_resolve_jobuser(output)
        dict_tmp = group_by_key(output, 0, 1)
        for dn in dict_tmp.keys():
            put_into_wmsdata(wmsdata_list, rowhost, dn, ['WM_in_res'], [dict_tmp[dn]])

    # Run a query to find the number of single jobs enqueued to Job Controller from WM in a given time interval PER USER and PER WMS
    logger.info('Run a query to find the number of single jobs enqueued to Job Controller from WM in a given time interval PER USER and PER WMS')
    stream = os.popen(API_CMD_PATH + "/enqueued_JSS_jobs " + STARTDATE + " " + ENDDATE)
    output = stream.readlines()
    if output:
        # checking jobs with null owner
        output = checkoutput_to_resolve_jobuser(output)
        dict_tmp = group_by_key(output, 0, 1)
        for dn in dict_tmp.keys():
            put_into_wmsdata(wmsdata_list, rowhost, dn, ['JC_in'], [dict_tmp[dn]])

    # Run a query to find the number of single jobs enqueued to Condor from Job Controller in a given time interval PER USER and PER WMS
    logger.info('Run a query to find the number of single jobs enqueued to Condor from Job Controller in a given time interval PER USER and PER WMS')
    stream = os.popen(API_CMD_PATH + "/transfer_CONDOR_jobs " + STARTDATE + " " + ENDDATE)
    output = stream.readlines()
    if output:
        # checking jobs with null owner
        output = checkoutput_to_resolve_jobuser(output)
        dict_tmp = group_by_key(output, 0, 1)
        for dn in dict_tmp.keys():
            put_into_wmsdata(wmsdata_list, rowhost, dn, ['JC_out'], [dict_tmp[dn]])

    # Run a query to find the number of single jobs done successfully in a given time interval PER USER and PER WMS
    logger.info('Run a query to find the number of single jobs done successfully in a given time interval PER USER and PER WMS')
    stream = os.popen(API_CMD_PATH + "/done_events " + STARTDATE + " " + ENDDATE)
    output = stream.readlines()
    if output:
        # checking jobs with null owner
        output = checkoutput_to_resolve_jobuser(output)
        dict_tmp = group_by_key(output, 0, 1)
        for dn in dict_tmp.keys():
            put_into_wmsdata(wmsdata_list, rowhost, dn, ['JOB_DONE'], [dict_tmp[dn]])

    # Run a query to find the number of single jobs aborted in a given time interval PER USER and PER WMS
    logger.info('Run a query to find the number of single jobs aborted in a given time interval PER USER and PER WMS')
    stream = os.popen(API_CMD_PATH + "/abort_events " + STARTDATE + " " + ENDDATE)
    output = stream.readlines()
    if output:
        # checking jobs with null owner
        output = checkoutput_to_resolve_jobuser(output)
        dict_tmp = group_by_key(output, 0, 1)
        for dn in dict_tmp.keys():
            put_into_wmsdata(wmsdata_list, rowhost, dn, ['JOB_ABORTED'], [dict_tmp[dn]])

    # Run a query to find the DEST_CE of jobs in a given time interval PER WMS
    logger.info('Run a query to find DEST_CE of jobs in a given time interval PER WMS')
    stream = os.popen(API_CMD_PATH + "/CE_histogram " + STARTDATE + " " + ENDDATE)
    output = stream.readlines()
    if output:
        # checking jobs with null owner
        output = checkoutput_to_resolve_jobuser(output)
        dict_tmp = group_by_key(output, 2, 0)
        for CE in dict_tmp.keys():
            rowCE = CE
            rowCEcount = dict_tmp[CE]
            wmsFOUND = False
            for wmsdata in wmsdata_list:
                if wmsdata.host == rowhost:
                    wmsFOUND = True
                    try:
                        wmsdata.add_ce(rowCE)
                        wmsdata.add_ce_count(rowCE, rowCEcount)
                    except wmsdata_class.CEPresent:
                        wmsdata.add_CE_count(rowCEcount)
            if not wmsFOUND:
                wmsdata = wmsdata_class.wmsdata(rowhost)
                wmsdata.add_ce(rowCE)
                wmsdata.add_ce_count(rowCE, rowCEcount)
                wmsdata_list.append(wmsdata)
    return wmsdata_list
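# --- Hedged usage sketch: not part of the original sensor code ---
# Illustrates how the API-based lb_query() above could be driven by a wrapper:
# query the last 15 minutes for one WMS host and report which hosts ended up in
# the returned wmsdata_list. The host name, the demo_ function name and the
# timestamp format are assumptions; the real wrapper passes whatever the L&B
# helper binaries expect.
def demo_lb_apiquery():
    import time
    ENDDATE = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    STARTDATE = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time() - 900))
    wmsdata_list = lb_query('wms01.example.org', STARTDATE, ENDDATE)
    for wmsdata in wmsdata_list:
        # wmsdata.host is the only accessor used here, as in the sensor code above
        print 'collected metrics for WMS host:', wmsdata.host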
def lb_query(lbhost, STARTDATE, ENDDATE, DBTYPE):
    # Initializing logger
    import logging
    logger = logging.getLogger('lb_query')
    confvar = readconf_func.readconf()
    users_stats = []

    # Establish a connection: DBTYPE selects the LBPROXY or LBSERVER credentials from the configuration
    if DBTYPE == 'LBPROXY':
        lbhost = confvar['LBPROXY_DB_HOST']
        dbuser = confvar['LBPROXY_DB_USER']
        dbname = confvar['LBPROXY_DB_NAME']
    elif DBTYPE == 'LBSERVER':
        lbhost = confvar['LB_DB_HOST']
        dbuser = confvar['LB_DB_USER']
        dbname = confvar['LB_DB_NAME']
    logger.info('Establishing a connection with mysql DB')
    db = MySQLdb.connection(host=lbhost, user=dbuser, db=dbname, passwd=confvar['SERVER_MYSQL_PASSWORD'][1:-1])

    ################ MAIN DATA CONTAINER LIST INITIALIZATION ######
    wmsdata_list = []
    ###############################################################

    def put_into_wmsdata(wmsdata_list, wmshostname, userdn, fieldlist, valuelist):
        wmsFOUND = False
        for wmsdata in wmsdata_list:
            if wmsdata.host == wmshostname:
                wmsFOUND = True
                try:
                    wmsdata.add_user(userdn)
                except wmsdata_class.UserPresent:
                    # logger.warning('User already present in wmsdata for host: ' + wmsdata.host)
                    pass
                for field in fieldlist:
                    wmsdata[userdn][field] = valuelist[fieldlist.index(field)]
        if not wmsFOUND:
            wmsdata = wmsdata_class.wmsdata(wmshostname)
            wmsdata.add_user(userdn)
            for field in fieldlist:
                wmsdata[userdn][field] = valuelist[fieldlist.index(field)]
            wmsdata_list.append(wmsdata)

    # Run a MySQL query to find the number of single jobs submitted in a given time interval PER USER and PER WMS
    logger.info('Running a MySQL query to find the number of single jobs submitted in a given time interval PER USER and PER WMS')
    querystr = "select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events,short_fields inner join users on events.userid=users.userid where events.event=short_fields.event and code='17' and time_stamp>'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and name='NSUBJOBS' and value='0' group by users.cert_subj,host;"
    logger.info('Query is : ' + querystr)
    db.query(querystr)
    r = db.store_result()
    # Iterate through the result set
    WMP_in = 0
    if r:
        for i in range(1, r.num_rows() + 1):
            row = r.fetch_row()
            if row:
                dn = row[0][0]
                rowhost = row[0][1]
                rowWMP_in = row[0][2]
                put_into_wmsdata(wmsdata_list, rowhost, dn, ['WMP_in'], [rowWMP_in])

    ##########################################################################################################
    # We decided not to take the avg and std of nodes per collection anymore because they are not summable
    # across multiple LBs. What we do instead is take PER USER the total number of jobs in collections and
    # the min and max of nodes per collection: these are summable, and the avg can be computed on the
    # collector side. We also sum over users on the sensor side and return the total number of collection
    # jobs and the min and max of nodes PER WMS. Summing over wmsdata will be done at the end of this
    # function or in the wrapper if the wmsdata_list is returned.
    ##########################################################################################################
    # Run a query to find per user and per host the number of collections, the total number of nodes in collections and the min and max of nodes per collection
    logger.info('Running a query to find per user and per host the number of collections, the total number of nodes in collections and the min and max of nodes per collection')
    querystr = "select users.cert_subj, host, COUNT(value), sum(value), min(value),max(value) from events,short_fields inner join users on events.userid=users.userid where events.event=short_fields.event and code='17' and time_stamp>'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and name='NSUBJOBS' and short_fields.event='0' and value>'0' group by users.cert_subj,host"
    logger.info('Query is : ' + querystr)
    db.query(querystr)
    r = db.store_result()
    # Iterate through the result set
    if r:
        for i in range(1, r.num_rows() + 1):
            row = r.fetch_row()
            if row:
                dn = row[0][0]
                rowhost = row[0][1]
                rowWMP_in_col = row[0][2]
                rowWMP_in_col_nodes = row[0][3]
                rowWMP_in_col_min_nodes = row[0][4]
                rowWMP_in_col_max_nodes = row[0][5]
                put_into_wmsdata(wmsdata_list, rowhost, dn,
                                 ['WMP_in_col', 'WMP_in_col_nodes', 'WMP_in_col_min_nodes', 'WMP_in_col_max_nodes'],
                                 [rowWMP_in_col, rowWMP_in_col_nodes, rowWMP_in_col_min_nodes, rowWMP_in_col_max_nodes])

    # Run a query to find PER USER and PER WMS the number of jobs enqueued to WM from WMP in a given time interval
    logger.info("Run a query to find PER USER and PER WMS the number of jobs enqueued to WM from WMP in a given time interval")
    querystr = "select users.cert_subj, host, COUNT(events.jobid) from events,short_fields inner join users on events.userid=users.userid where events.event=short_fields.event and code='4' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and events.event=short_fields.event and prog='NetworkServer' and name='RESULT' and value='OK' group by users.cert_subj,host;"
    logger.info('Query is : ' + querystr)
    db.query(querystr)
    r = db.store_result()
    if r:
        for i in range(1, r.num_rows() + 1):
            row = r.fetch_row()
            if row:
                dn = row[0][0]
                rowhost = row[0][1]
                rowWM_in = row[0][2]
                put_into_wmsdata(wmsdata_list, rowhost, dn, ['WM_in'], [rowWM_in])

    # Run a MySQL query to find the number of both collection and single jobs enqueued to WM in a given time interval from LogMonitor (i.e. Resubmitted) PER USER and PER WMS
    logger.info('Run a MySQL query to find the number of both collection and single jobs enqueued to WM in a given time interval from LogMonitor (i.e. Resubmitted) PER USER and PER WMS')
    querystr = "select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events,short_fields inner join users on events.userid=users.userid where code='4' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and events.event=short_fields.event and name='RESULT' and value='OK' and prog='LogMonitor' group by users.cert_subj, host;"
    logger.info('Query is : ' + querystr)
    db.query(querystr)
    r = db.store_result()
    # Iterate through the result set
    if r:
        for i in range(1, r.num_rows() + 1):
            row = r.fetch_row()
            if row:
                usernew = row[0][0]
                index = row[0][0].find('/CN=proxy/CN=proxy')
                if index != -1:
                    usernew = row[0][0][0:index]
                dn = usernew
                rowhost = row[0][1]
                rowWM_in_res = row[0][2]
                put_into_wmsdata(wmsdata_list, rowhost, dn, ['WM_in_res'], [rowWM_in_res])

    # Run a MySQL query to find the number of single jobs enqueued to Job Controller from WM in a given time interval PER USER and PER WMS
    logger.info('Run a MySQL query to find the number of single jobs enqueued to Job Controller from WM in a given time interval PER USER and PER WMS')
    querystr = "select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events,short_fields inner join users on events.userid=users.userid where code='4' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and events.event=short_fields.event and name='RESULT' and value='OK' and prog='WorkloadManager' group by users.cert_subj,host;"
    logger.info('Query is : ' + querystr)
    db.query(querystr)
    r = db.store_result()
    # Iterate through the result set
    if r:
        for i in range(1, r.num_rows() + 1):
            row = r.fetch_row()
            if row:
                usernew = row[0][0]
                index = row[0][0].find('/CN=proxy/CN=proxy')
                if index != -1:
                    usernew = row[0][0][0:index]
                dn = usernew
                rowhost = row[0][1]
                rowJC_in = row[0][2]
                put_into_wmsdata(wmsdata_list, rowhost, dn, ['JC_in'], [rowJC_in])

    # Run a MySQL query to find the number of single jobs enqueued to Condor from Job Controller in a given time interval PER USER and PER WMS
    logger.info('Run a MySQL query to find the number of single jobs enqueued to Condor from Job Controller in a given time interval PER USER and PER WMS')
    querystr = "select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events,short_fields inner join users on events.userid=users.userid where code='1' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and events.jobid=short_fields.jobid and events.event=short_fields.event and name='RESULT' and value='OK' and prog='JobController' group by users.cert_subj,host;"
    logger.info('Query is : ' + querystr)
    db.query(querystr)
    r = db.store_result()
    # Iterate through the result set
    if r:
        for i in range(1, r.num_rows() + 1):
            row = r.fetch_row()
            if row:
                usernew = row[0][0]
                index = row[0][0].find('/CN=proxy/CN=proxy')
                if index != -1:
                    usernew = row[0][0][0:index]
                dn = usernew
                rowhost = row[0][1]
                rowJC_out = row[0][2]
                put_into_wmsdata(wmsdata_list, rowhost, dn, ['JC_out'], [rowJC_out])

    # Run a MySQL query to find the number of single jobs done successfully in a given time interval PER USER and PER WMS
    logger.info('Run a MySQL query to find the number of single jobs done successfully in a given time interval PER USER and PER WMS')
    querystr = "select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events,short_fields inner join users on events.userid=users.userid where events.jobid=short_fields.jobid and code='10' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and prog='LogMonitor' and name='REASON' and (value='Job terminated successfully' or value='Job Terminated Successfully') group by users.cert_subj,host;"
    logger.info('Query is : ' + querystr)
    db.query(querystr)
    r = db.store_result()
    # Iterate through the result set
    if r:
        for i in range(1, r.num_rows() + 1):
            row = r.fetch_row()
            if row:
                usernew = row[0][0]
                index = row[0][0].find('/CN=proxy/CN=proxy')
                if index != -1:
                    usernew = row[0][0][0:index]
                dn = usernew
                rowhost = row[0][1]
                rowJOB_DONE = row[0][2]
                put_into_wmsdata(wmsdata_list, rowhost, dn, ['JOB_DONE'], [rowJOB_DONE])

    # Run a MySQL query to find the number of single jobs aborted in a given time interval PER USER and PER WMS
    logger.info('Run a MySQL query to find the number of single jobs aborted in a given time interval PER USER and PER WMS')
    querystr = "select users.cert_subj,host,COUNT(DISTINCT(events.jobid)) from events inner join users on events.userid=users.userid where code='12' and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' group by users.cert_subj,host;"
    logger.info('Query is : ' + querystr)
    db.query(querystr)
    r = db.store_result()
    # Iterate through the result set
    if r:
        for i in range(1, r.num_rows() + 1):
            row = r.fetch_row()
            if row:
                usernew = row[0][0]
                index = row[0][0].find('/CN=proxy/CN=proxy')
                if index != -1:
                    usernew = row[0][0][0:index]
                dn = usernew
                rowhost = row[0][1]
                rowJOB_ABORTED = row[0][2]
                put_into_wmsdata(wmsdata_list, rowhost, dn, ['JOB_ABORTED'], [rowJOB_ABORTED])

    # Run a MySQL query to find the DEST_CE of jobs in a given time interval PER WMS
    logger.info('Run a MySQL query to find DEST_CE of jobs in a given time interval PER WMS')
    ##### old ce query - this double counts ce for jobs landed onto cream ce
    #querystr = "select value, host, COUNT(value) from (select DISTINCT(short_fields.event),events.jobid, short_fields.value, host from events,short_fields where events.jobid=short_fields.jobid and time_stamp >'" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and prog='WorkloadManager' and name='DEST_HOST' and value!='localhost' and value!='unavailable' and code='15') as temp group by value, host;"
    ##### New query to avoid double counting ce for jobs landed onto cream ce
    querystr = "select value,host, count(value) from (select distinct(short_fields.jobid), value, host from short_fields inner join events where events.code='15' and events.prog = 'WorkloadManager' and name='DEST_HOST' and time_stamp > '" + STARTDATE + "' and time_stamp <='" + ENDDATE + "' and value!='localhost' and value!='unavailable' and events.jobid=short_fields.jobid) as temp group by value, host;"
    logger.info('Query is : ' + querystr)
    db.query(querystr)
    r = db.store_result()
    # Iterate through the result set
    if r:
        for i in range(1, r.num_rows() + 1):
            row = r.fetch_row()
            if row:
                rowCE = row[0][0]
                rowhost = row[0][1]
                rowCEcount = row[0][2]
                wmsFOUND = False
                for wmsdata in wmsdata_list:
                    if wmsdata.host == rowhost:
                        wmsFOUND = True
                        try:
                            wmsdata.add_ce(rowCE)
                            wmsdata.add_ce_count(rowCE, rowCEcount)
                        except wmsdata_class.CEPresent:
                            # logger.warning('CE already present in wmsdata for host: ' + wmsdata.host)
                            wmsdata.add_CE_count(rowCEcount)
                if not wmsFOUND:
                    wmsdata = wmsdata_class.wmsdata(rowhost)
                    wmsdata.add_ce(rowCE)
                    wmsdata.add_ce_count(rowCE, rowCEcount)
                    wmsdata_list.append(wmsdata)

    # Run a MySQL query to find the LB used to store the jobs in a given time interval
    # Available only if DBTYPE = LBPROXY
    if DBTYPE == 'LBPROXY':
        logger.info('Run a MySQL query to find the LB used to store the jobs in a given time interval')
        querystr = "select distinct dg_jobid from jobs inner join events on jobs.jobid=events.jobid where events.code = '17' and time_stamp > '" + STARTDATE + "' and time_stamp < '" + ENDDATE + "';"
        logger.info('Query is : ' + querystr)
        db.query(querystr)
        r = db.store_result()
        # Iterate through the result set
        if r:
            for i in range(1, r.num_rows() + 1):
                row = r.fetch_row()
                if row:
                    rowLB = row[0][0]
                    LBstr = rowLB[rowLB.find('//') + 2:rowLB.find(':9000')]
                    for wmsdata in wmsdata_list:
                        wmsdata.add_lb(LBstr)
    db.close()
    # filename = confvar['INSTALL_PATH'] + '/sensors/tmp/USERSTATS_' + lbhost + '_' + wmshost + '.txt'
    # fileusersstats = open(filename, 'w')
    # fileusersstats.write('START OF FILE\n')
    # for i in range(0, len(users_stats)):
    #     fileusersstats.write(str(users_stats[i][0]) + '|' + str(users_stats[i][1]) + '|' + str(users_stats[i][2]) + '|' + str(users_stats[i][3]) + '|' + str(users_stats[i][4]) + '|' + str(users_stats[i][5]) + '|' + str(users_stats[i][6]) + '|' + str(users_stats[i][7]) + '|' + str(users_stats[i][8]) + '|\n')
    # fileusersstats.write('END OF FILE\n')
    # fileusersstats.close()
    return wmsdata_list
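# --- Hedged usage sketch: not part of the original sensor code ---
# The DBTYPE argument selects which credentials are read from the WMSMonitor
# configuration (LBPROXY_* vs LB_* keys), so a wrapper only chooses the flavour
# and the time window. The demo_ function name, the host name and the timestamp
# format below are placeholders; with DBTYPE='LBPROXY' the lbhost argument is
# overwritten from the configuration anyway.
def demo_lb_dbquery():
    wmsdata_list = lb_query('lb01.example.org', '2009-01-01 00:00:00', '2009-01-01 00:15:00', 'LBPROXY')
    for wmsdata in wmsdata_list:
        print 'collected metrics for WMS host:', wmsdata.host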
#! /usr/bin/python
# Main program to call sensor functions
import os, commands, sys, fpformat
sys.path.append('/opt/WMSMonitor/collector/bin/')
import logging
import logpredef
logger = logging.getLogger('wms_balancing_arbiter')
import readconf_func
conf = readconf_func.readconf()


def mail_notification(subject, body):
    try:
        # sending mail....
        SENDMAIL = "/usr/sbin/sendmail"  # sendmail location
        p = os.popen("%s -t" % SENDMAIL, "w")
        p.write("To: " + conf.get('LOAD_BALANCING_SITE_CONTACT') + "\n")
        p.write(subject)
        p.write("\n")  # blank line separating headers from body
        p.write(body)
        sts = p.close()
        if sts != 0:
            logger.info("Sendmail exit status" + str(sts))
    except Exception, e:
        logger.error("ERROR SENDING MAIL: " + str(e))


def wms_balancing_arbiter():
    '''wms_balancing_arbiter() -> updating wms instances available behind an alias
    depending on the load of the instances according to the load metric provided
    by wms_balancing_metric function
    Return None if errors are raised during calculation.
    '''
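# --- Hedged usage sketch: not part of the original script ---
# mail_notification() streams raw headers and a body to "sendmail -t", so the
# caller appears to be expected to pass a complete "Subject: ..." header line as
# the first argument (the blank line separating headers from body is written by
# the function itself). The demo_ function name and message text are illustrative only.
def demo_mail_notification():
    mail_notification("Subject: [WMSMonitor] load balancing alarm",
                      "wms01 has been removed from the alias because of high load.\n")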
def wms_usermapping():
    # Initializing logger
    import logging
    import logpredef_wmslb
    logger = logging.getLogger('wms_usermapping')
    confvar = readconf_func.readconf()
    timenow = str(int(time.mktime(time.localtime())))
    hostname = socket.getfqdn()
    logger.info("server hostname is :" + hostname)
    if hostname == '':
        logger.error('Could not determine machine hostname! Exiting...')
        sys.exit(1)
    filename = confvar['INSTALL_PATH'] + '/sensors/tmp/USERSMAPPING.txt'
    f = open(filename, 'w')
    f.write("START OF MAPPING TABLE\n")
    data = "\"" + time.strftime("%d %b", time.localtime()) + "\""
    # deciding whether to use rotated log or not
    logfile = ''
    maxlog = int(confvar['MAX_ROTATED_LOG'])
    for i in range(maxlog + 1):
        if i == 0:
            if os.access(confvar.get('GLITE_LOG_DIR') + '/wmproxy.log', os.F_OK) == True:
                logfile = logfile + confvar.get('GLITE_LOG_DIR') + '/wmproxy.log'
        else:
            fname = confvar.get('GLITE_LOG_DIR') + '/wmproxy.log.' + str(i)
            if os.access(fname, os.F_OK) == True:
                std = os.popen("tail -2 " + fname + " | grep " + data)
                if len(std.readlines()):
                    logfile = logfile + ' ' + fname
    cmd = "grep " + data + " " + logfile + " |grep -A1 CLIENT > " + confvar['INSTALL_PATH'] + "/sensors/tmp/tmpgrep.txt"
    if os.system(cmd) == 0:
        logfile = confvar['INSTALL_PATH'] + "/sensors/tmp/tmpgrep.txt"
        cmd = "cat " + logfile + " | grep CLIENT| sed -e 's/.*DN: //g' -e 's/\/CN=proxy.*//g' |" + "grep -v " + data + " " + "| sort |uniq"
        std = os.popen(cmd)
        stdstr = std.readlines()
        if len(stdstr) > 0:
            for line in stdstr:
                ltmp = line.split('/CN=')
                user = ltmp[len(ltmp) - 1]
                user = user.split('/')[0]
                cmd = 'grep -A1 "' + user.rstrip() + '" ' + logfile + " |grep Role |tail -1 | sed 's/.*VOMS.*0 //g'"
                std = os.popen(cmd)
                stdstr = std.readlines()
                if len(stdstr) > 0:
                    istr = stdstr[0]
                    for l in istr.split(' '):
                        if l.find('Role') != -1:
                            VO = l.split('/')[1]
                            VO_SUB = l[(l.find(VO) + len(VO)):-1]
                            # CAPABILITY = l.split('/')[3].split('=')[1]
                            # print 'USER: '******'\nVO = ' + VO + '\nRole = ' + ROLE + '\nCapability = ' + CAPABILITY
                            # print line.rstrip(),' ',user.rstrip(), ' ', VO , ' ', VO_SUB
                            f.write(line.rstrip() + ' | ' + VO + ' | ' + VO_SUB + '\n')
                else:
                    logger.error("ERROR: Could not determine User /VO/Role/Capability ! \n")
    else:
        logger.error("FILE " + logfile + " NOT FOUND! Exiting...\n")
    f.write("END OF MAPPING TABLE\n")
    f.close()
    cmd = 'rm -f ' + logfile
    os.system(cmd)
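# --- Hedged sketch: not part of the original sensor code ---
# wms_usermapping() writes one 'DN | VO | role-suffix' row per user between the
# START/END markers of USERSMAPPING.txt. A consumer on the collector side could
# parse that file as sketched below; read_usermapping is a hypothetical helper,
# not part of WMSMonitor.
def read_usermapping(filename):
    mapping = []
    f = open(filename)
    for line in f.readlines():
        line = line.strip()
        # skip the START/END markers and empty lines
        if not line or line.startswith('START OF') or line.startswith('END OF'):
            continue
        parts = [p.strip() for p in line.split('|')]
        if len(parts) >= 3:
            mapping.append((parts[0], parts[1], parts[2]))
    f.close()
    return mapping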