def __init__(self, vm_limit, azure_config, skip_setup, local):
    """Set up provisioner state, the manager machine and its synthetic boot entry.

    vm_limit     -- user-supplied cap on the number of VMs
    azure_config -- Azure settings object, or None/falsy to disable Azure
    skip_setup   -- forwarded to AzureExperiment
    local        -- True for a purely local run (disables Azure even if configured)
    """
    self.vm_limit = vm_limit  # user input
    self.budget = 0
    self.timestamp = datetime.now()
    self.cost_pred = 0
    self.wf_end = None
    self.jobs_terminated = False
    self.last_resched = None
    self.workflow = Workflow()
    self.logwatcher = LogWatcher()
    self.schedule = Schedule()
    # The manager is modelled as a machine whose synthetic 'boot' job
    # started and finished at creation time.
    manager = Machine()
    manager.status = MachineStatus.manager
    manager.condor_slot = 'manager'
    self.machines = [manager]
    boot_entry = ScheduleEntry(Job('boot', None), manager, self.timestamp, self.timestamp)
    boot_entry.real_start = self.timestamp
    boot_entry.real_end = self.timestamp
    boot_entry.status = EntryStatus.completed
    self.schedule.add_entry_host(boot_entry, manager)
    self.local = local
    if azure_config and not local:
        # Remote mode: spin up (or attach to) the Azure experiment.
        hostname = socket.gethostname()
        self.exp = AzureExperiment(azure_config, skip_setup=skip_setup, name=hostname)
        self.master_addr = socket.gethostbyname(hostname)
        self.user = azure_config.admin_username
    else:
        self.exp = self.master_addr = self.user = None
def initenviron(logname, maxdrones, debug=False, timeout=90, nanodebug=0, cmadebug=0):
    'Initialize the test environment.'
    logwatch = LogWatcher(logname, [], timeout, returnonlymatch=True, debug=debug)
    logwatch.setwatch()
    sysenv = SystemTestEnvironment(logname, maxdrones,
                                   nanodebug=nanodebug, cmadebug=cmadebug)
    CMAinit(None, host=str(sysenv.cma.ipaddr), readonly=True,
            neologin=SystemTestEnvironment.NEO4JLOGIN,
            neopass=SystemTestEnvironment.NEO4JPASS)
    url = 'http://%s:%d/db/data/' % (sysenv.cma.ipaddr, 7474)
    print >> sys.stderr, 'OPENING Neo4j at URL %s' % url
    neo4j.authenticate('%s:7474' % sysenv.cma.ipaddr,
                       SystemTestEnvironment.NEO4JLOGIN,
                       SystemTestEnvironment.NEO4JPASS)
    store = Store(neo4j.Graph(url), readonly=True)
    # Register every known GraphNode class with the read-only store.
    for classname in GN.GraphNode.classmap:
        GN.GraphNode.initclasstypeobj(store, classname)
    logger('$(grep MemFree: /proc/meminfo)', hardquote=False)
    # Every drone plus the CMA itself should eventually report status "up".
    tq = QueryTest(store,
                   '''START drone=node:Drone('*:*') WHERE drone.status = "up" RETURN drone''',
                   GN.nodeconstructor, debug=debug)
    if not tq.check([None, ], minrows=maxdrones + 1, maxrows=maxdrones + 1,
                    delay=0.5, maxtries=20):
        # Leave the CMA container around for post-mortem debugging.
        sysenv.cma.cleanupwhendone = False
        raise RuntimeError('Query of "up" status failed. Weirdness')
    return sysenv, store
def run(self, nano=None, debug=None, timeout=240):
    'Actually start the nanoprobe and see if it worked'
    if debug is None:
        debug = self.debug
    if nano is None:
        # Pick a running system that is not yet running the nanoprobe service.
        nanozero = self.testenviron.select_nano_noservice()
        if len(nanozero) > 0:
            nano = nanozero[0]
    if (nano is None or nano.status != TestSystem.RUNNING
            or SystemTestEnvironment.NANOSERVICE in nano.runningservices):
        return self._record(AssimSysTest.SKIPPED)
    regexes = self.nano_start_regexes(nano)
    watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
    watch.setwatch()
    qstr = ('''START drone=node:Drone('*:*') '''
            '''WHERE drone.designation = "{0.hostname}" and drone.status = "up" '''
            '''RETURN drone''')
    nano.startservice(SystemTestEnvironment.NANOSERVICE)
    return self.checkresults(watch, timeout, qstr, None, nano)
def initenviron(logname, maxdrones, mgmtsystem, debug=False, cmaimage='', nanoimages=[],
                timeout=90, nanodebug=0, cmadebug=0):
    'Initialize the test environment.'
    logwatch = LogWatcher(logname, [], timeout, returnonlymatch=True, debug=debug)
    logwatch.setwatch()
    sysenv = SystemTestEnvironment(logname, maxdrones, mgmtsystem,
                                   cmaimage=cmaimage, nanoimages=nanoimages,
                                   nanodebug=nanodebug, cmadebug=cmadebug)
    CMAinit(None, host=str(sysenv.cma.ipaddr), readonly=True,
            neologin=SystemTestEnvironment.NEO4JLOGIN,
            neopass=SystemTestEnvironment.NEO4JPASS)
    url = 'http://%s:%d/db/data/' % (sysenv.cma.ipaddr, 7474)
    print >> sys.stderr, 'OPENING Neo4j at URL %s' % url
    neo4j.authenticate('%s:7474' % sysenv.cma.ipaddr,
                       SystemTestEnvironment.NEO4JLOGIN,
                       SystemTestEnvironment.NEO4JPASS)
    store = Store(neo4j.Graph(url), readonly=True)
    # Register every known GraphNode class with the read-only store.
    for classname in GN.GraphNode.classmap:
        GN.GraphNode.initclasstypeobj(store, classname)
    logger('$(grep MemFree: /proc/meminfo)', hardquote=False)
    # Every drone plus the CMA itself should eventually report status "up".
    tq = QueryTest(store,
                   '''START drone=node:Drone('*:*') WHERE drone.status = "up" RETURN drone''',
                   GN.nodeconstructor, debug=debug)
    if not tq.check([None, ], minrows=maxdrones + 1, maxrows=maxdrones + 1,
                    delay=0.5, maxtries=20):
        # Leave the CMA container around for post-mortem debugging.
        sysenv.cma.cleanupwhendone = False
        raise RuntimeError('Query of "up" status failed. Weirdness')
    return sysenv, store
def testmain(logname, maxdrones=3, debug=False): 'Test our test cases' logger('Starting test of our test cases') try: sysenv, ourstore = AssimSysTest.initenviron(logname, maxdrones, debug, cmadebug=5, nanodebug=3) except AssertionError: print 'FAILED initial startup - which is pretty basic' print 'Any chance you have another CMA running??' raise RuntimeError('Another CMA is running(?)') badregexes = ( ' ERROR: ', ' CRIT: ', ' CRITICAL: ' # 'HBDEAD' #, r'Peer at address .* is dead' , r'OUTALLDONE .* while in state NONE') #for cls in [SimulCMAandNanoprobeRestart for j in range(0,20)]: #for j in range(0,10): #for cls in [DiscoverService for j in range(0,100)]: for cls in AssimSysTest.testset: badwatch = LogWatcher(logname, badregexes, timeout=1, debug=0) logger('CREATED LOG WATCH with %s' % str(badregexes)) badwatch.setwatch() logger('Starting test %s' % (cls.__name__)) if cls is DiscoverService: ret = cls(ourstore, logname, sysenv, debug=debug, service='ssh', monitorname='check_ssh').run() else: ret = cls(ourstore, logname, sysenv, debug=debug).run() #print >> sys.stderr, 'Got return of %s from test %s' % (ret, cls.__name__) badmatch = badwatch.look(timeout=1) if badmatch is not None: print 'OOPS! Got bad results!', badmatch raise RuntimeError('Test %s said bad words! [%s]' % (cls.__name__, badmatch)) assert ret == AssimSysTest.SUCCESS or ret == AssimSysTest.SKIPPED #assert ret == AssimSysTest.SUCCESS logger('WOOT! All tests were successful!')
def testmain(logname, maxdrones=25, debug=False): "A simple test main program" regexes = [] # pylint says: [W0612:testmain] Unused variable 'j' # pylint: disable=W0612 for j in range(0, maxdrones + 1): regexes.append("Stored packages JSON data from *([^ ]*) ") logwatch = LogWatcher(logname, regexes, timeout=90, returnonlymatch=True) logwatch.setwatch() sysenv = SystemTestEnvironment(maxdrones) print >> sys.stderr, "Systems all up and running." url = "http://%s:%d/db/data/" % (sysenv.cma.ipaddr, 7474) CMAinit(None) store = Store(neo4j.GraphDatabaseService(url), readonly=True) for classname in GN.GraphNode.classmap: GN.GraphNode.initclasstypeobj(store, classname) results = logwatch.lookforall() if debug: print >> sys.stderr, "WATCH RESULTS:", results tq = QueryTest(store, "START drone=node:Drone('*:*') RETURN drone", GN.nodeconstructor, debug=debug) print >> sys.stderr, "Running Query" if tq.check([None], minrows=maxdrones + 1, maxrows=maxdrones + 1): print "WOOT! Systems passed query check after initial startup!" else: print "Systems FAILED initial startup query check" print "Do you have a second CMA running??" print "Rerunning query with debug=True" tq.debug = True tq.check([None], minrows=maxdrones + 1, maxrows=maxdrones + 1) return 1 cma = sysenv.cma nano = sysenv.nanoprobes[0] regex = r"%s cma INFO: System %s at \[::ffff:%s]:1984 reports graceful shutdown" % ( cma.hostname, nano.hostname, nano.ipaddr, ) # print >> sys.stderr, 'REGEX IS: [%s]' % regex logwatch = LogWatcher(logname, [regex], timeout=30, returnonlymatch=False) logwatch.setwatch() nano.stopservice(SystemTestEnvironment.NANOSERVICE) logwatch.look() time.sleep(30) tq = QueryTest( store, ("""START drone=node:Drone('*:*') """ """WHERE drone.designation = "{0.hostname}" RETURN drone"""), GN.nodeconstructor, debug=debug, ) if tq.check([nano], downbyshutdown, maxrows=1): print "WOOT! Systems passed query check after nano shutdown!" else: print "Systems FAILED query check after nano shutdown"
def run(self, nano=None, debug=None, timeout=60):
    'Actually stop and start (restart) the CMA and see if it worked'
    if debug is None:
        debug = self.debug
    cma = self.testenviron.cma
    cma.stopservice(SystemTestEnvironment.CMASERVICE)
    regexes = self.cma_start_regexes()
    watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
    watch.setwatch()
    # Optionally leave the CMA down for a while before restarting it.
    if self.delay > 0:
        time.sleep(self.delay)
    cma.startservice(SystemTestEnvironment.CMASERVICE)
    # This just makes sure the database is still up - which it should be...
    # Once we receive the CMA update message, we really should already be good to go
    qstr = '''START one=node(*) RETURN one LIMIT 1'''
    return self.checkresults(watch, timeout, qstr, None, nano)
def __init__(self):
    """Create a monitor with a single local 'manager' machine whose
    synthetic boot job is recorded as already completed."""
    self.workflow = Workflow()
    self.creation_timestamp = self.timestamp = datetime.now()
    self.logwatcher = LogWatcher()
    manager = Machine()
    manager.status = MachineStatus.manager
    manager.condor_slot = 'local'
    self.machines = [manager]
    boot_entry = ScheduleEntry(Job('boot', None), manager, None, None)
    boot_entry.real_start = self.timestamp
    boot_entry.real_end = self.timestamp
    boot_entry.status = EntryStatus.completed
    self.entries = [boot_entry]
    self.entries_cid = {}  # condor id -> ScheduleEntry
def perform_tests(testset, sysenv, store, itermax, logname, debug=False): 'Actually perform the given set of tests the given number of times, etc' badregexes = ( r' (ERROR:|CRIT:|CRITICAL:|nanoprobe\[[0-9]*]: segfault at|' #r'Peer at address .* is dead|' r'OUTALLDONE .* while in state NONE' r')', ) itercount = 1 while True: test = random.choice(testset) badwatch = LogWatcher(logname, badregexes, timeout=1, debug=0) logit("STARTING test %d - %s" % (itercount, test.__name__)) os.system('logger -s "Load Avg: $(cat /proc/loadavg)"') os.system('logger -s "$(grep MemFree: /proc/meminfo)"') badwatch.setwatch() if test.__name__ == 'DiscoverService': testobj = test(store, logname, sysenv, debug=debug, service='ssh', monitorname='check_ssh') else: testobj = test(store, logname, sysenv, debug=debug) ret = testobj.run() match = badwatch.look() if match is not None: logit('BAD MESSAGE from Test %d %s: %s' % (itercount, test.__name__, match)) testobj.replace_result(AssimSysTest.FAIL) ret = AssimSysTest.FAIL if ret == AssimSysTest.SUCCESS: logit('Test %d %s succeeded!' % (itercount, test.__name__)) itercount += 1 elif ret == AssimSysTest.FAIL: logit('Test %d %s FAILED :-(' % (itercount, test.__name__)) itercount += 1 elif ret == AssimSysTest.SKIPPED: logit('Test %d %s skipped' % (itercount, test.__name__)) else: logit('Test %d %s RETURNED SOMETHING REALLY WEIRD [%s]' % (itercount, test.__name__, str(ret))) testobj.replace_result(AssimSysTest.FAIL) print '' if itercount > itermax: break return summarize_tests()
def run(self, nano=None, debug=None, timeout=60):
    'Actually stop and start (restart) the CMA and see if it worked'
    if debug is None:
        debug = self.debug
    cma = self.testenviron.cma
    cma.stopservice(SystemTestEnvironment.CMASERVICE)
    # The CMA announces its versions on startup; that's our restart marker.
    regex = (' %s .* INFO: Neo4j version .* // py2neo version .*'
             ' // Python version .* // java version.*') % cma.hostname
    watch = LogWatcher(self.logfilename, (regex,), timeout=timeout, debug=debug)
    watch.setwatch()
    # Optionally leave the CMA down for a while before restarting it.
    if self.delay > 0:
        time.sleep(self.delay)
    cma.startservice(SystemTestEnvironment.CMASERVICE)
    # This just makes sure the database is still up - which it should be...
    # Once we receive the CMA update message, we really should already be good to go
    qstr = '''START one=node(*) RETURN one LIMIT 1'''
    return self.checkresults(watch, timeout, qstr, None, nano)
def __init__(self, logname, nanocount=10,
             cmaimage='assimilation/build-wily',
             nanoimages=('assimilation/build-wily',),
             sysclass=DockerSystem, cleanupwhendone=False,
             nanodebug=0, cmadebug=0, chunksize=20):
    """Init/constructor for our SystemTestEnvironment.

    Spawns the CMA, waits for its startup banner in the logs, then spawns
    'nanocount' nanoprobes in chunks of 'chunksize' to limit load.
    Raises RuntimeError if the CMA never announces itself.
    """
    self.sysclass = sysclass
    self.cmaimage = cmaimage
    self.nanoimages = nanoimages
    self.nanoprobes = []
    self.cma = None
    self.debug = 0
    self.cleanupwhendone = cleanupwhendone
    self.logname = logname
    watch = LogWatcher(logname, [])
    watch.setwatch()
    # NOTE: these hard-coded values override the caller-supplied debug levels.
    nanodebug = 1
    cmadebug = 2
    self.nanodebug = nanodebug
    # BUGFIX: was 'self.cmadebug = nanodebug' (copy-paste error), which
    # recorded the nanoprobe debug level as the CMA debug level.
    self.cmadebug = cmadebug
    self.spawncma(nanodebug=nanodebug, cmadebug=cmadebug)
    # The CMA's startup banner tells us it is alive.
    regex = (' %s .* INFO: Neo4j version .* // py2neo version .*'
             ' // Python version .* // (java|openjdk) version.*') % self.cma.hostname
    watch.setregexes((regex,))
    if watch.lookforall(timeout=120) is None:
        os.system("logger -s 'CMA did not start!! [[%s]]'" % (regex))
        print >> sys.stderr, 'CMA did not start!! %s' % regex
        raise RuntimeError('CMA did not start: %s' % regex)
    print >> sys.stderr, 'nanocount is', nanocount
    print >> sys.stderr, 'self.nanoimages is', self.nanoimages
    # We do this in chunks to manage stress on our test environment
    children_left = range(0, nanocount)
    while (len(children_left) > 0):
        self._create_nano_chunk(children_left[0:chunksize])
        del children_left[0:chunksize]
def run(self, nano=None, debug=None, timeout=180):
    'Actually stop the nanoprobe and see if it worked'
    if debug is None:
        debug = self.debug
    if nano is None:
        # Pick a running system that is currently running the nanoprobe service.
        nanozero = self.testenviron.select_nano_service()
        if len(nanozero) > 0:
            nano = nanozero[0]
    if (nano is None or nano.status != TestSystem.RUNNING
            or SystemTestEnvironment.NANOSERVICE not in nano.runningservices):
        return self._record(AssimSysTest.SKIPPED)
    regexes = self.nano_stop_regexes(nano)
    watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
    watch.setwatch()
    # After a clean stop the drone should be dead with reason HBSHUTDOWN.
    qstr = ('''START drone=node:Drone('*:*') '''
            '''WHERE drone.designation = "{0.hostname}" and drone.status = "dead" '''
            '''and drone.reason = "HBSHUTDOWN" RETURN drone''')
    nano.stopservice(SystemTestEnvironment.NANOSERVICE)
    return self.checkresults(watch, timeout, qstr, None, nano)
def perform_tests(testset, sysenv, store, itermax, logname, debug=False): 'Actually perform the given set of tests the given number of times, etc' badregexes=(r' (ERROR:|CRIT:|CRITICAL:|nanoprobe\[[0-9]*]: segfault at|' #r'Peer at address .* is dead|' r'OUTALLDONE .* while in state NONE' r')',) itercount=1 while True: test = random.choice(testset) badwatch = LogWatcher(logname, badregexes, timeout=1, debug=0) logit("STARTING test %d - %s" % (itercount, test.__name__)) os.system('logger -s "Load Avg: $(cat /proc/loadavg)"') os.system('logger -s "$(grep MemFree: /proc/meminfo)"') badwatch.setwatch() if test.__name__ == 'DiscoverService': testobj = test(store, logname, sysenv, debug=debug , service='ssh', monitorname='check_ssh') else: testobj = test(store, logname, sysenv, debug=debug) ret = testobj.run() match = badwatch.look() if match is not None: logit('BAD MESSAGE from Test %d %s: %s' % (itercount, test.__name__, match)) testobj.replace_result(AssimSysTest.FAIL) ret = AssimSysTest.FAIL if ret == AssimSysTest.SUCCESS: logit('Test %d %s succeeded!' % (itercount, test.__name__)) itercount += 1 elif ret == AssimSysTest.FAIL: logit('Test %d %s FAILED :-(' % (itercount, test.__name__)) itercount += 1 elif ret == AssimSysTest.SKIPPED: logit('Test %d %s skipped' % (itercount, test.__name__)) else: logit('Test %d %s RETURNED SOMETHING REALLY WEIRD [%s]' % (itercount, test.__name__, str(ret))) testobj.replace_result(AssimSysTest.FAIL) print '' if itercount > itermax: break return summarize_tests()
def __init__(self, logname, nanocount=10,
             cmaimage='assimilation/build-utopic',
             nanoimages=('assimilation/build-utopic',),
             sysclass=DockerSystem, cleanupwhendone=True,
             nanodebug=0, cmadebug=0, chunksize=20):
    """Init/constructor for our SystemTestEnvironment.

    Spawns the CMA, waits for its startup banner in the logs, then spawns
    'nanocount' nanoprobes in chunks of 'chunksize' to limit load.
    Raises RuntimeError if the CMA never announces itself.
    """
    self.sysclass = sysclass
    self.cmaimage = cmaimage
    self.nanoimages = nanoimages
    self.nanoprobes = []
    self.cma = None
    self.debug = 0
    self.cleanupwhendone = cleanupwhendone
    self.logname = logname
    watch = LogWatcher(logname, [])
    watch.setwatch()
    # NOTE: these hard-coded values override the caller-supplied debug levels.
    nanodebug = 1
    cmadebug = 0
    self.nanodebug = nanodebug
    # BUGFIX: was 'self.cmadebug = nanodebug' (copy-paste error), which
    # recorded the nanoprobe debug level as the CMA debug level.
    self.cmadebug = cmadebug
    self.spawncma(nanodebug=nanodebug, cmadebug=cmadebug)
    # The CMA's startup banner tells us it is alive.
    regex = (' %s .* INFO: Neo4j version .* // py2neo version .*'
             ' // Python version .* // java version.*') % self.cma.hostname
    watch.setregexes((regex,))
    if watch.lookforall(timeout=60) is None:
        print >> sys.stderr, 'CMA did not start!!'
        raise RuntimeError('CMA did not start')
    print >> sys.stderr, 'nanocount is', nanocount
    print >> sys.stderr, 'self.nanoimages is', self.nanoimages
    # We do this in chunks to manage stress on our test environment
    children_left = range(0, nanocount)
    while (len(children_left) > 0):
        self._create_nano_chunk(children_left[0:chunksize])
        del children_left[0:chunksize]
def testmain(logname, maxdrones=3, debug=False): 'Test our test cases' logger('Starting test of our test cases') try: sysenv, ourstore = AssimSysTest.initenviron(logname, maxdrones, debug , cmadebug=5, nanodebug=3) except AssertionError: print 'FAILED initial startup - which is pretty basic' print 'Any chance you have another CMA running??' raise RuntimeError('Another CMA is running(?)') badregexes=(' ERROR: ', ' CRIT: ', ' CRITICAL: ' # 'HBDEAD' #, r'Peer at address .* is dead' , r'OUTALLDONE .* while in state NONE' ) #for cls in [SimulCMAandNanoprobeRestart for j in range(0,20)]: #for j in range(0,10): #for cls in [DiscoverService for j in range(0,100)]: for cls in AssimSysTest.testset: badwatch = LogWatcher(logname, badregexes, timeout=1, debug=0) logger('CREATED LOG WATCH with %s' % str(badregexes)) badwatch.setwatch() logger('Starting test %s' % (cls.__name__)) if cls is DiscoverService: ret = cls(ourstore, logname, sysenv, debug=debug , service='ssh', monitorname='check_ssh').run() else: ret = cls(ourstore, logname, sysenv, debug=debug).run() #print >> sys.stderr, 'Got return of %s from test %s' % (ret, cls.__name__) badmatch = badwatch.look(timeout=1) if badmatch is not None: print 'OOPS! Got bad results!', badmatch raise RuntimeError('Test %s said bad words! [%s]' % (cls.__name__, badmatch)) assert ret == AssimSysTest.SUCCESS or ret == AssimSysTest.SKIPPED #assert ret == AssimSysTest.SUCCESS logger('WOOT! All tests were successful!')
def __init__(self):
    """Create a monitor with a single local 'manager' machine whose
    synthetic boot job is recorded as already completed."""
    self.workflow = Workflow()
    self.creation_timestamp = self.timestamp = datetime.now()
    self.logwatcher = LogWatcher()
    manager = Machine()
    manager.status = MachineStatus.manager
    manager.condor_slot = 'local'
    self.machines = [manager]
    boot_entry = ScheduleEntry(Job('boot', None), manager, None, None)
    boot_entry.real_start = self.timestamp
    boot_entry.real_end = self.timestamp
    boot_entry.status = EntryStatus.completed
    self.entries = [boot_entry]
    self.entries_cid = {}  # condor id -> ScheduleEntry
def run(self, nano=None, debug=None, timeout=300): '''Our default timeout is so long because we can take a while to give up shutting down the nanoprobe - an ACK timeout might have to occur before it can shut down. ''' if debug is None: debug = self.debug if nano is None: nanozero = self.testenviron.select_nano_service() if len(nanozero) < 1: return self._record(AssimSysTest.SKIPPED) nano = nanozero[0] cma = self.testenviron.cma regexes = self.nano_stop_regexes(nano) regexes.extend(self.cma_stop_regexes()) regexes.extend(self.cma_start_regexes()) watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug) watch.setwatch() cma.stopservice(SystemTestEnvironment.CMASERVICE) nano.stopservice(SystemTestEnvironment.NANOSERVICE, async=True) cma.startservice(SystemTestEnvironment.CMASERVICE) if self.delay > 0: time.sleep(self.delay) qstr = ( '''START drone=node:Drone('*:*') ''' '''WHERE drone.designation = "{0.hostname}" and drone.status = "dead" ''' '''and drone.reason = "HBSHUTDOWN" RETURN drone''') rc = self.checkresults(watch, timeout, qstr, None, nano) if rc != AssimSysTest.SUCCESS: return rc # We have to do this in two parts because of the asynchronous shutdown above regexes = self.nano_start_regexes(nano) watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug) watch.setwatch() nano.startservice(SystemTestEnvironment.NANOSERVICE) qstr = ( '''START drone=node:Drone('*:*') ''' '''WHERE drone.designation = "{0.hostname}" and drone.status = "up" ''' '''RETURN drone''') return self.checkresults(watch, timeout, qstr, None, nano)
def _create_nano_chunk(self, childnos):
    '''Create a chunk of nanoprobes.

    Spawns one nanoprobe per entry in childnos, then waits for all of their
    startup/registration/discovery messages to appear in the log.
    Raises RuntimeError if any expected message does not show up in time.
    '''
    # BUGFIX: with an empty chunk the error path below would reference the
    # (never-assigned) loop variable 'nano' and raise NameError instead.
    if not childnos:
        return
    watch = LogWatcher(self.logname, [])
    watch.setwatch()
    regexes = []
    for childcount in childnos:
        childcount = childcount  # Make pylint happy...
        nano = self.spawnnanoprobe(debug=self.nanodebug)
        regexes.extend([
            r' (%s) nanoprobe\[.*]: NOTICE: Connected to CMA. Happiness :-D'
            % (nano.hostname),
            r' %s cma INFO: Drone %s registered from address \[::ffff:%s]'
            % (self.cma.hostname, nano.hostname, nano.ipaddr),
            r' %s cma INFO: Processed tcpdiscovery JSON data from (%s) into graph.'
            % (self.cma.hostname, nano.hostname),
        ])
        self.nanoprobes.append(nano)
    watch.setregexes(regexes)
    if watch.lookforall(timeout=30) is None:
        # NOTE: the hostname/ipaddr reported here are from the LAST nanoprobe
        # spawned, although the missing regexes may belong to any of them.
        raise RuntimeError(
            'Nanoprobes did not start [%s, %s] - missing %s'
            % (nano.hostname, nano.ipaddr, str(watch.unmatched)))
def _create_nano_chunk(self, childnos):
    'Create a chunk of nanoprobes'
    watch = LogWatcher(self.logname, [], debug=0)
    watch.setwatch()
    regexes = []
    for childcount in childnos:
        childcount = childcount  # Make pylint happy...
        nano = self.spawnnanoprobe(debug=self.nanodebug)
        # Each nanoprobe must connect, register, and have its tcpdiscovery
        # data processed before we consider it started.
        regexes.extend([
            r' %s nanoprobe\[.*]: NOTICE: Connected to CMA. Happiness :-D'
            % (nano.hostname),
            r' %s cma INFO: Drone %s registered from address \[::ffff:%s]'
            % (self.cma.hostname, nano.hostname, nano.ipaddr),
            r' %s cma INFO: Processed u?n?changed tcpdiscovery'
            r' JSON data from %s into graph.'
            % (self.cma.hostname, nano.hostname),
        ])
        self.nanoprobes.append(nano)
    print >> sys.stderr, len(regexes), 'NANOPROBE REGEXES ARE:', (regexes)
    watch.setregexes(regexes)
    if watch.lookforall(timeout=120) is None:
        raise RuntimeError('Nanoprobes did not start - missing %s'
                           % (str(watch.unmatched)))
def _create_nano_chunk(self, childnos):
    '''Create a chunk of nanoprobes.

    Spawns one nanoprobe per entry in childnos, then waits for all of their
    startup/registration/discovery messages to appear in the log.
    Raises RuntimeError if any expected message does not show up in time.
    '''
    # BUGFIX: with an empty chunk the error path below would reference the
    # (never-assigned) loop variable 'nano' and raise NameError instead.
    if not childnos:
        return
    watch = LogWatcher(self.logname, [])
    watch.setwatch()
    regexes = []
    for childcount in childnos:
        childcount = childcount  # Make pylint happy...
        nano = self.spawnnanoprobe(debug=self.nanodebug)
        regexes.extend([
            r' (%s) nanoprobe\[.*]: NOTICE: Connected to CMA. Happiness :-D'
            % (nano.hostname),
            r' %s cma INFO: Drone %s registered from address \[::ffff:%s]'
            % (self.cma.hostname, nano.hostname, nano.ipaddr),
            r' %s cma INFO: Processed tcpdiscovery JSON data from (%s) into graph.'
            % (self.cma.hostname, nano.hostname),
        ])
        self.nanoprobes.append(nano)
    watch.setregexes(regexes)
    if watch.lookforall(timeout=30) is None:
        # NOTE: the hostname/ipaddr reported here are from the LAST nanoprobe
        # spawned, although the missing regexes may belong to any of them.
        raise RuntimeError('Nanoprobes did not start [%s, %s] - missing %s'
                           % (nano.hostname, nano.ipaddr, str(watch.unmatched)))
def run(self, nano=None, debug=None, timeout=300): '''Our default timeout is so long because we can take a while to give up shutting down the nanoprobe - an ACK timeout might have to occur before it can shut down. ''' if debug is None: debug = self.debug if nano is None: nanozero = self.testenviron.select_nano_service() if len(nanozero) < 1: return self._record(AssimSysTest.SKIPPED) nano = nanozero[0] cma = self.testenviron.cma regexes = self.nano_stop_regexes(nano) regexes.extend(self.cma_stop_regexes()) regexes.extend(self.cma_start_regexes()) watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug) watch.setwatch() cma.stopservice(SystemTestEnvironment.CMASERVICE) nano.stopservice(SystemTestEnvironment.NANOSERVICE, async=True) cma.startservice(SystemTestEnvironment.CMASERVICE) if self.delay > 0: time.sleep(self.delay) qstr = ( '''START drone=node:Drone('*:*') ''' '''WHERE drone.designation = "{0.hostname}" and drone.status = "dead" ''' '''and drone.reason = "HBSHUTDOWN" RETURN drone''') rc = self.checkresults(watch, timeout, qstr, None, nano) if rc != AssimSysTest.SUCCESS: return rc # We have to do this in two parts because of the asynchronous shutdown above regexes = self.nano_start_regexes(nano) watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug) watch.setwatch() nano.startservice(SystemTestEnvironment.NANOSERVICE) qstr = ( '''START drone=node:Drone('*:*') ''' '''WHERE drone.designation = "{0.hostname}" and drone.status = "up" ''' '''RETURN drone''') return self.checkresults(watch, timeout, qstr, None, nano)
#!/usr/bin/python from pymongo import MongoClient from logwatcher import LogWatcher import datetime import json header = ["time", "duration", "client_addr", "result_code", "bytes", "req_method", "URL", "user", "hier_code", "type"] COL = len(header) client = MongoClient('localhost', 27017) db = client.squid accesslog = db.accesslog def callback(filename, lines): for line in lines: rec = list(line.split()) if len(rec) != COL : continue record = dict(zip(header, rec)) # print json.dumps(record, indent=2) result = accesslog.insert_one(record) print result.inserted_id lw = LogWatcher("/var/log/squid3", callback) lw.loop()
class Monitor(): def __init__(self): self.workflow = Workflow() self.creation_timestamp = self.timestamp = datetime.now() self.logwatcher = LogWatcher() manager = Machine() manager.status = MachineStatus.manager manager.condor_slot = 'local' self.machines = [manager] boot_entry = ScheduleEntry(Job('boot', None), manager, None, None) boot_entry.real_start = self.timestamp boot_entry.real_end = self.timestamp boot_entry.status = EntryStatus.completed self.entries = [boot_entry] self.entries_cid = {} def add_workflow(self, workflow_dir): wf_id = self.workflow.add_workflow(workflow_dir) self.logwatcher.add(wf_id, workflow_dir) def sync_machines(self): slots = condor_slots() for s in slots: if s not in [m.condor_slot for m in self.machines]: machine = Machine() machine.status = MachineStatus.running machine.condor_slot = s boot_job = Job('boot', None) boot_entry = ScheduleEntry(boot_job, machine, None, None) boot_entry.log[LogKey.real_start] = self.creation_timestamp boot_entry.log[LogKey.real_end] = self.timestamp boot_entry.status = EntryStatus.completed self.entries.append(boot_entry) self.machines.append(machine) print "++Machine", s def sync_jobs(self): log_entries = self.logwatcher.nexts() for le in log_entries: if le.id in self.entries_cid: # in dict keys entry = self.entries_cid[le.id] else: entry = ScheduleEntry(condor_id=le.id) self.entries.append(entry) self.entries_cid[le.id] = entry print "++Job", le.id entry.log[le.event] = le.timestamp if le.event == LogKey.execute: entry.status = EntryStatus.executing elif le.event == LogKey.job_terminated: entry.status = EntryStatus.completed wf_id, dag_job_id, slot = condor_history(le.id) job = next( (j for j in self.workflow.jobs if j.dag_job_id == dag_job_id and j.wf_id == wf_id), None) if job: entry.job = job entry.host = next( (m for m in self.machines if m.condor_slot == slot), self.machines[0]) print "--Job", le.id, dag_job_id, entry.host.condor_slot def update_timestamp(self): self.timestamp = datetime.now()
def run(self, nano=None, debug=None, timeout=240, service=None, monitorname=None):
    '''Cycle the nanoprobe around starting a service, then verify that the
    service monitor comes up and the drone reports "up".'''
    if debug is None:
        debug = self.debug
    if service is None:
        service = self.service
    if monitorname is None:
        monitorname = self.monitorname
    if nano is None:
        nanozero = self.testenviron.select_nano_noservice(service=service)
        if nanozero is None or len(nanozero) < 1:
            return self._record(AssimSysTest.SKIPPED)
        else:
            nano = nanozero[0]
    assert service not in nano.runningservices
    # Make sure the nanoprobe itself is running before we begin.
    if SystemTestEnvironment.NANOSERVICE not in nano.runningservices:
        startregexes = self.nano_start_regexes(nano)
        watch = LogWatcher(self.logfilename, startregexes, timeout=timeout, debug=debug)
        watch.setwatch()
        nano.startservice(SystemTestEnvironment.NANOSERVICE)
        match = watch.look(timeout=timeout)
        if match is None:
            logger('ERROR: Test %s timed out waiting for any of %s [timeout:%s]'
                   % (self.__class__.__name__, str(watch.regexes), timeout))
            return self._record(AssimSysTest.FAIL)
    # Stop the nanoprobe so the new service is discovered on restart.
    regexes = self.nano_stop_regexes(nano)
    watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
    watch.setwatch()
    nano.stopservice(SystemTestEnvironment.NANOSERVICE)
    if watch.lookforall(timeout=timeout) is None:
        logger('ERROR: Test %s timed out waiting for all of %s [timeout:%s]'
               % (self.__class__.__name__, str(watch.unmatched), timeout))
        return self._record(AssimSysTest.FAIL)
    # Start the target service, restart the nanoprobe, and wait for the
    # monitor to be configured and report the service as started.
    regexes = self.nano_start_regexes(nano)
    regexes.extend(self.nano_startmonitor_regexes(nano, monitorname))
    regexes.extend(self.nano_service_start_regexes(nano, monitorname))
    watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
    watch.setwatch()
    nano.startservice(service)
    nano.startservice(SystemTestEnvironment.NANOSERVICE)
    if watch.lookforall(timeout=timeout) is None:
        logger('ERROR: Test %s timed out waiting for all of %s [timeout:%s]'
               % (self.__class__.__name__, str(watch.unmatched), timeout))
        return self._record(AssimSysTest.FAIL)
    # @TODO make a better query
    # but it should be enough to let us validate the rest
    qstr = ('''START 
drone=node:Drone('*:*') '''
            '''WHERE drone.designation = "{0.hostname}" and drone.status = "up" '''
            '''RETURN drone''')
    return self.checkresults(watch, timeout, qstr, None, nano, debug=debug)
class Monitor(): def __init__(self): self.workflow = Workflow() self.creation_timestamp = self.timestamp = datetime.now() self.logwatcher = LogWatcher() manager = Machine() manager.status = MachineStatus.manager manager.condor_slot = 'local' self.machines = [manager] boot_entry = ScheduleEntry(Job('boot', None), manager, None, None) boot_entry.real_start = self.timestamp boot_entry.real_end = self.timestamp boot_entry.status = EntryStatus.completed self.entries = [boot_entry] self.entries_cid = {} def add_workflow(self, workflow_dir): wf_id = self.workflow.add_workflow(workflow_dir) self.logwatcher.add(wf_id, workflow_dir) def sync_machines(self): slots = condor_slots() for s in slots: if s not in [m.condor_slot for m in self.machines]: machine = Machine() machine.status = MachineStatus.running machine.condor_slot = s boot_job = Job('boot', None) boot_entry = ScheduleEntry(boot_job, machine, None, None) boot_entry.log[LogKey.real_start] = self.creation_timestamp boot_entry.log[LogKey.real_end] = self.timestamp boot_entry.status = EntryStatus.completed self.entries.append(boot_entry) self.machines.append(machine) print "++Machine", s def sync_jobs(self): log_entries = self.logwatcher.nexts() for le in log_entries: if le.id in self.entries_cid: # in dict keys entry = self.entries_cid[le.id] else: entry = ScheduleEntry(condor_id=le.id) self.entries.append(entry) self.entries_cid[le.id] = entry print "++Job", le.id entry.log[le.event] = le.timestamp if le.event == LogKey.execute: entry.status = EntryStatus.executing elif le.event == LogKey.job_terminated: entry.status = EntryStatus.completed wf_id, dag_job_id, slot = condor_history(le.id) job = next((j for j in self.workflow.jobs if j.dag_job_id == dag_job_id and j.wf_id == wf_id), None) if job: entry.job = job entry.host = next((m for m in self.machines if m.condor_slot == slot), self.machines[0]) print "--Job", le.id, dag_job_id, entry.host.condor_slot def update_timestamp(self): self.timestamp = datetime.now()
#!/usr/bin/python from pymongo import MongoClient from logwatcher import LogWatcher import datetime import json header = [ "time", "duration", "client_addr", "result_code", "bytes", "req_method", "URL", "user", "hier_code", "type" ] COL = len(header) client = MongoClient('localhost', 27017) db = client.squid accesslog = db.accesslog def callback(filename, lines): for line in lines: rec = list(line.split()) if len(rec) != COL: continue record = dict(zip(header, rec)) # print json.dumps(record, indent=2) result = accesslog.insert_one(record) print result.inserted_id lw = LogWatcher("/var/log/squid3", callback) lw.loop()
def testmain(logname, maxdrones=25, debug=False): 'A simple test main program' regexes = [] #pylint says: [W0612:testmain] Unused variable 'j' #pylint: disable=W0612 for j in range(0, maxdrones + 1): regexes.append('Stored packages JSON data from *([^ ]*) ') logwatch = LogWatcher(logname, regexes, timeout=90, returnonlymatch=True) logwatch.setwatch() sysenv = SystemTestEnvironment(maxdrones) print >> sys.stderr, 'Systems all up and running.' url = ('http://%s:%d/db/data/' % (sysenv.cma.ipaddr, 7474)) CMAinit(None) store = Store(neo4j.Graph(url), readonly=True) for classname in GN.GraphNode.classmap: GN.GraphNode.initclasstypeobj(store, classname) results = logwatch.lookforall() if debug: print >> sys.stderr, 'WATCH RESULTS:', results tq = QueryTest(store, "START drone=node:Drone('*:*') RETURN drone", GN.nodeconstructor, debug=debug) print >> sys.stderr, 'Running Query' if tq.check([ None, ], minrows=maxdrones + 1, maxrows=maxdrones + 1): print 'WOOT! Systems passed query check after initial startup!' else: print 'Systems FAILED initial startup query check' print 'Do you have a second CMA running??' print 'Rerunning query with debug=True' tq.debug = True tq.check([ None, ], minrows=maxdrones + 1, maxrows=maxdrones + 1) return 1 cma = sysenv.cma nano = sysenv.nanoprobes[0] regex = ( r'%s cma INFO: System %s at \[::ffff:%s]:1984 reports graceful shutdown' % (cma.hostname, nano.hostname, nano.ipaddr)) #print >> sys.stderr, 'REGEX IS: [%s]' % regex logwatch = LogWatcher(logname, [ regex, ], timeout=30, returnonlymatch=False) logwatch.setwatch() nano.stopservice(SystemTestEnvironment.NANOSERVICE) logwatch.look() time.sleep(30) tq = QueryTest( store, ('''START drone=node:Drone('*:*') ''' '''WHERE drone.designation = "{0.hostname}" RETURN drone'''), GN.nodeconstructor, debug=debug) if tq.check([ nano, ], downbyshutdown, maxrows=1): print 'WOOT! Systems passed query check after nano shutdown!' else: print 'Systems FAILED query check after nano shutdown'
def run(self, nano=None, debug=None, timeout=240, service=None, monitorname=None):
    '''Run one monitored-service test cycle on a nanoprobe.

    Ensures the nanoprobe is up, stops it, then restarts both the target
    service and the nanoprobe while watching the logs for the expected
    start/stop/monitor messages.  Any timeout waiting on the log watcher
    records and returns AssimSysTest.FAIL; a nanoprobe with the service
    already running cannot be found records AssimSysTest.SKIPPED.
    Finishes by validating the drone's "up" status through checkresults().
    '''
    # Fall back to per-test defaults for anything the caller left unset.
    if debug is None:
        debug = self.debug
    if service is None:
        service = self.service
    if monitorname is None:
        monitorname = self.monitorname
    if nano is None:
        # Pick a nanoprobe that is NOT already running the target service.
        nanozero = self.testenviron.select_nano_noservice(service=service)
        if nanozero is None or len(nanozero) < 1:
            return self._record(AssimSysTest.SKIPPED)
        else:
            nano = nanozero[0]
    assert service not in nano.runningservices
    # If the nanoprobe daemon itself isn't running yet, bring it up first
    # and wait for any of its start messages.
    if SystemTestEnvironment.NANOSERVICE not in nano.runningservices:
        startregexes = self.nano_start_regexes(nano)
        watch = LogWatcher(self.logfilename, startregexes, timeout=timeout, debug=debug)
        watch.setwatch()
        nano.startservice(SystemTestEnvironment.NANOSERVICE)
        match = watch.look(timeout=timeout)
        if match is None:
            logger('ERROR: Test %s timed out waiting for any of %s [timeout:%s]'
                   % (self.__class__.__name__, str(watch.regexes), timeout))
            return self._record(AssimSysTest.FAIL)
    # Stop the nanoprobe and wait for ALL of its shutdown messages.
    regexes = self.nano_stop_regexes(nano)
    watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
    watch.setwatch()
    nano.stopservice(SystemTestEnvironment.NANOSERVICE)
    if watch.lookforall(timeout=timeout) is None:
        logger('ERROR: Test %s timed out waiting for all of %s [timeout:%s]'
               % (self.__class__.__name__, str(watch.unmatched), timeout))
        return self._record(AssimSysTest.FAIL)
    # Restart: expect nanoprobe start + monitor setup + service start messages.
    regexes = self.nano_start_regexes(nano)
    regexes.extend(self.nano_startmonitor_regexes(nano, monitorname))
    regexes.extend(self.nano_service_start_regexes(nano, monitorname))
    watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
    watch.setwatch()
    nano.startservice(service)
    nano.startservice(SystemTestEnvironment.NANOSERVICE)
    if watch.lookforall(timeout=timeout) is None:
        logger('ERROR: Test %s timed out waiting for all of %s [timeout:%s]'
               % (self.__class__.__name__, str(watch.unmatched), timeout))
        return self._record(AssimSysTest.FAIL)
    # @TODO make a better query
    # but it should be enough to let us validate the rest
    # NOTE(review): the "{0.hostname}" placeholder is not formatted here --
    # presumably checkresults() substitutes `nano` into the query; confirm.
    qstr = ('''START 
drone=node:Drone('*:*') ''' '''WHERE drone.designation = "{0.hostname}" and drone.status = "up" ''' '''RETURN drone''')
    return self.checkresults(watch, timeout, qstr, None, nano, debug=debug)
print_log('INFO: Total write lines = ' + str(len(lines))) def write2HDFS(localpath, hdfspath): print_log('INFO: Append local file to hdfs. ' + localpath + '-> hdfs://' + hdfspath ) p = Popen([HDFS_BIN, 'dfs', '-appendToFile', localpath, hdfspath], stdout=PIPE, stderr=PIPE) result = p.stdout.read() if result: print result result_err = p.stderr.read() if result_err: print result_err def makeHDFSDir(newpath): print_log('INFO: Make the HDFS dir = hdfs://' + newpath) p = Popen([HDFS_BIN, 'dfs', '-mkdir', newpath], stdout=PIPE, stderr=PIPE) result = p.stdout.read() if result: print result result_err = p.stderr.read() if result_err: print result_err def print_log(msg): print time.strftime("%Y/%m/%d %H:%M:%S") + '\t' + msg if __name__ == '__main__': watcher = LogWatcher(LOG_DIR + LOG_FILE, callback, True) watcher.loop(TIME_CYCLE_SEC) print "Exiting Main Thread"
def main():
    """Starts the LogWatcher process.

    NOTE(review): the previous docstring documented a ``dataPath``
    parameter, but ``main()`` takes no arguments -- ``dataPath`` is a free
    (module-level) name that must be defined elsewhere in this module
    before ``main()`` is called.  Confirm against the full file.
    """
    lw = LogWatcher(dataPath)
class Provisioner():
    '''Drives VM provisioning and job scheduling for Condor workflows.

    Tracks a money budget, a set of Machines (the first being the local
    "manager"), a Schedule of job entries, and a LogWatcher that feeds
    Condor log events back into the schedule.  When `azure_config` is given
    and `local` is false, VMs are allocated through an AzureExperiment.
    '''
    def __init__(self, vm_limit, azure_config, skip_setup, local):
        # vm_limit: user-supplied cap on the number of VMs.
        # local: when true, no Azure experiment is created and slot matching
        # in sync_machines falls back to positional assignment.
        self.vm_limit = vm_limit # user input
        self.budget = 0
        self.timestamp = datetime.now()
        self.cost_pred = 0
        self.wf_end = None
        self.jobs_terminated = False
        self.last_resched = None
        self.workflow = Workflow()
        self.logwatcher = LogWatcher()
        self.schedule = Schedule()
        # The manager machine represents this host; it is never deallocated.
        manager = Machine()
        manager.status = MachineStatus.manager
        manager.condor_slot = 'manager'
        self.machines = [manager]
        # Synthetic, already-completed "boot" entry so the manager looks
        # booted from the schedule's point of view.
        boot_entry = ScheduleEntry(Job('boot', None), manager, self.timestamp, self.timestamp)
        boot_entry.real_start = self.timestamp
        boot_entry.real_end = self.timestamp
        boot_entry.status = EntryStatus.completed
        self.schedule.add_entry_host(boot_entry, manager)
        self.local = local
        if azure_config and not local:
            hostname = socket.gethostname()
            self.exp = AzureExperiment(azure_config, skip_setup=skip_setup, name=hostname)
            self.master_addr = socket.gethostbyname(hostname)
            self.user = azure_config.admin_username
        else:
            self.exp = self.master_addr = self.user = None

    def add_workflow(self, workflow_dir, prediction_file, budget):
        '''Register a workflow and add its (rounded) budget to the pool.'''
        self.budget = self.budget + int(round(float(budget)))
        wf_id = self.workflow.add_workflow(workflow_dir, prediction_file=prediction_file)
        # Start watching this workflow's Condor logs for job events.
        self.logwatcher.add(wf_id, workflow_dir)

    def update_schedule(self):
        '''Recompute the schedule for all jobs not yet running/completed.'''
        print 'UPDATE SCHED'
        self.update_budget_timestamp()
        self.last_resched = self.timestamp
        # completed and running entries will not change
        self.schedule.rm_scheduled_entries()
        if self.workflow.has_jobs_to_sched(self.schedule):
            # Max number of vms
            nmax = get_nmax(self.workflow, self.machines, self.schedule, self.vm_limit, self.timestamp, self.local)
            print 'NMAX',nmax
            # Get the number of machines to be used
            schedule, _cost, _n = sched_number_of_machines(self.workflow, self.machines, self.schedule, nmax, self.timestamp, self.budget, self.local)
            print "N", _n, 'budget', self.budget
            # Update schedule
            self.schedule = schedule

    def update_budget_timestamp(self):
        '''Charge elapsed VM time against the budget and advance the clock.'''
        timestamp = datetime.now()
        if self.timestamp != None:
            # Assuming vm_cost is expressed as cost per second, and that the
            # number of machines has not changed since the last
            # self.timestamp.
            delta = (timestamp - self.timestamp).seconds
            charged = delta * len(self.machines) * VM_COST_PER_SEC
            self.budget = self.budget - charged
        self.timestamp = timestamp

    def update_wf_pred(self):
        '''Refresh the predicted total cost and workflow end time.'''
        self.cost_pred, self.wf_end = sched_cost_pred(self.machines, self.schedule, self.timestamp)

    def allocate_new_vms(self):
        '''Start allocation of every machine whose boot entry is due.'''
        # boot entries
        if self.schedule != None:
            for m in self.schedule.entries_host.keys():
                # First entry per host is its boot entry.
                entry = self.schedule.entries_host[m][0]
                if entry.status == EntryStatus.scheduled and entry.start() <= self.timestamp:
                    m.allocate(self.exp, self.master_addr, self.user)
                    self.machines.append(m)
                    entry.status = EntryStatus.executing
                    entry.log[LogKey.real_start] = self.timestamp

    def deallocate_vms(self):
        '''Tear down idle machines and drop them from the machine list.'''
        for m in self.machines:
            if m.status == MachineStatus.manager:
                continue
            # if there's no more budget or
            # if there's nothing executing or scheduled to the machine
            if self.schedule == None or len([e for e in self.schedule.entries_host[m] if e.status != EntryStatus.completed]) == 0:
                m.deallocate(self.exp)
                print "--Machine", m.condor_slot
        # update machine list
        self.machines = [m for m in self.machines if m.status != MachineStatus.deallocating]

    def sync_machines(self):
        '''Match live Condor slots to allocating machines; mark them running.'''
        slots_addrs = condor_slots()
        running_machines = [m for m in self.machines if m.status == MachineStatus.running]
        allocating_machines = [m for m in self.machines if m.status == MachineStatus.allocating]
        #allocating_machines.sort(key=lambda x: self.schedule.entries_host[x][0].start())
        i = 0
        for (slot,addr) in slots_addrs:
            if slot not in [m.condor_slot for m in running_machines]:
                allocated_machine = None
                if not self.local:
                    # Azure: match by private address.
                    allocated_machine = next((m for m in allocating_machines if m.priv_addr == addr), None)
                elif len(allocating_machines[i:]) > 0:
                    # update machine
                    # Local: addresses are meaningless; assign positionally.
                    allocated_machine = allocating_machines[i]
                if allocated_machine:
                    allocated_machine.status = MachineStatus.running
                    allocated_machine.condor_slot = slot
                    # update entry
                    boot_entry = self.schedule.entries_host[allocated_machine][0]
                    boot_entry.log[LogKey.real_end] = self.timestamp
                    boot_entry.status = EntryStatus.completed
                    i += 1
                    print "++Machine", allocated_machine.condor_slot
                else:
                    # Only complain if some non-completed entry expected a
                    # machine at this address.
                    if next((e for e in self.schedule.entries if e.host.priv_addr == addr and e.status != EntryStatus.completed), None):
                        print "ERROR: slot not found", slot, addr, 'nr', len(running_machines), 'na', len(allocating_machines)

    def _handle_log_events(self):
        '''Apply pending Condor log events to the schedule.

        Returns True if at least one job terminated.
        '''
        jobs_terminated = False
        log_entries = self.logwatcher.nexts()
        for le in log_entries:
            if le.id in self.schedule.entries_cid:
                sched_entry = self.schedule.entries_cid[le.id]
            else:
                # First event for this condor id: find the entry by dag job
                # id + workflow id and remember the mapping.
                sched_entry = next((e for e in self.schedule.entries if e.job.dag_job_id == le.name and e.job.wf_id == le.wf_id), None)
                if sched_entry:
                    sched_entry.condor_id = le.id
                    self.schedule.add_entry_cid(sched_entry)
            if sched_entry:
                sched_entry.log[le.event] = le.timestamp
                if le.event == LogKey.execute:
                    sched_entry.status = EntryStatus.executing
                elif le.event == LogKey.job_terminated:
                    sched_entry.status = EntryStatus.completed
                    sched_entry.log[LogKey.real_end] = self.timestamp
                    print "--Job", le.id, sched_entry.job.dag_job_id, sched_entry.host.condor_slot
                    jobs_terminated = True
            else:
                print 'could not find sched_entry for:', le.id
        return jobs_terminated

    def _handle_ready_jobs(self):
        '''Pin idle Condor jobs to their scheduled, running machines.'''
        need_condor_resched = False
        idle_cjobs = condor_idle() # idle jobs
        for cjob in idle_cjobs:
            condor_id, wf_id, dag_job_id = cjob.split()
            if condor_id in self.schedule.entries_cid:
                sched_entry = self.schedule.entries_cid[condor_id]
            else:
                sched_entry = next((e for e in self.schedule.entries \
                            if e.job.dag_job_id == dag_job_id \
                            and e.job.wf_id == wf_id ), None)
                if sched_entry:
                    sched_entry.condor_id = condor_id
                    self.schedule.add_entry_cid(sched_entry)
            # Only dispatch entries still scheduled on an already-running
            # machine.
            if sched_entry and sched_entry.status == EntryStatus.scheduled \
                    and sched_entry.host.status == MachineStatus.running:
                sched_entry.status = EntryStatus.executing
                sched_entry.log[LogKey.real_start] = self.timestamp
                print "++Job", condor_id, dag_job_id, sched_entry.host.condor_slot
                condor_qedit(condor_id, wf_id, dag_job_id, sched_entry.host.condor_slot)
                need_condor_resched = True
        if need_condor_resched:
            condor_reschedule()

    def update_jobs(self):
        '''One poll cycle: ingest log events, maybe reschedule, dispatch jobs.'''
        # handle log events and check if any job terminated
        self.jobs_terminated = self._handle_log_events() or self.jobs_terminated
        # need to update schedule (?)
        # Reschedule at most once per SCHED_TIMEOUT seconds, and only after
        # some job actually terminated.
        if self.last_resched and self.jobs_terminated and \
                ((self.timestamp - self.last_resched).seconds > SCHED_TIMEOUT):
            self.update_schedule()
            self.jobs_terminated = False
        # handle jobs that are ready to execute
        self._handle_ready_jobs()