Ejemplo n.º 1
0
 def __init__(self, vm_limit, azure_config, skip_setup, local):
     """Set up monitor state: budget/cost bookkeeping, the manager
     machine with its synthetic completed 'boot' schedule entry, and
     (when not local) the Azure experiment connection.
     """
     self.vm_limit = vm_limit # user input
     self.budget = 0
     self.cost_pred = 0
     self.wf_end = None
     self.timestamp = datetime.now()

     self.jobs_terminated = False
     self.last_resched = None

     self.workflow = Workflow()
     self.logwatcher = LogWatcher()
     self.schedule = Schedule()

     # The manager host is modelled as a Machine occupying a reserved slot.
     mgr = Machine()
     mgr.status = MachineStatus.manager
     mgr.condor_slot = 'manager'
     self.machines = [mgr]

     # Record a zero-length, already-completed 'boot' entry for the manager.
     boot = ScheduleEntry(Job('boot', None), mgr, self.timestamp, self.timestamp)
     boot.real_start = self.timestamp
     boot.real_end = self.timestamp
     boot.status = EntryStatus.completed
     self.schedule.add_entry_host(boot, mgr)

     self.local = local
     if azure_config and not local:
         # Remote run: resolve our own address and remember Azure credentials.
         host = socket.gethostname()
         self.exp = AzureExperiment(azure_config, skip_setup=skip_setup, name=host)
         self.master_addr = socket.gethostbyname(host)
         self.user = azure_config.admin_username
     else:
         self.exp = self.master_addr = self.user = None
Ejemplo n.º 2
0
    def initenviron(logname, maxdrones, debug=False, timeout=90, nanodebug=0, cmadebug=0):
        '''Initialize the test environment.

        Starts a SystemTestEnvironment (CMA plus nanoprobes), attaches a
        read-only Store to its Neo4j database, and verifies that the
        expected number of drones report status "up".

        Returns the tuple (sysenv, store).
        '''
        # Arm the log watcher before starting anything so no message is missed.
        logwatch = LogWatcher(logname, [], timeout, returnonlymatch=True, debug=debug)
        logwatch.setwatch()
        sysenv = SystemTestEnvironment(logname, maxdrones, nanodebug=nanodebug, cmadebug=cmadebug)
        # Point the CMA library at the freshly started CMA's database (read-only).
        CMAinit(None, host=str(sysenv.cma.ipaddr), readonly=True,
                neologin=SystemTestEnvironment.NEO4JLOGIN, neopass=SystemTestEnvironment.NEO4JPASS)
        url = 'http://%s:%d/db/data/' % (sysenv.cma.ipaddr, 7474)
        print >> sys.stderr, 'OPENING Neo4j at URL %s' % url
        neo4j.authenticate('%s:7474' % sysenv.cma.ipaddr,
                           SystemTestEnvironment.NEO4JLOGIN,
                           SystemTestEnvironment.NEO4JPASS)
        store = Store(neo4j.Graph(url), readonly=True)
        # Register every known GraphNode class with the store.
        for classname in GN.GraphNode.classmap:
            GN.GraphNode.initclasstypeobj(store, classname)

        logger('$(grep MemFree: /proc/meminfo)', hardquote=False)
        tq = QueryTest(store
        ,   '''START drone=node:Drone('*:*') WHERE drone.status = "up" RETURN drone'''
        ,   GN.nodeconstructor, debug=debug)

        # Expect exactly maxdrones+1 "up" rows -- presumably the nanoprobe
        # drones plus the CMA's own drone entry; verify against the schema.
        if not tq.check([None,], minrows=maxdrones+1, maxrows=maxdrones+1
            ,   delay=0.5, maxtries=20):
            # Leave the containers around for post-mortem inspection.
            sysenv.cma.cleanupwhendone = False
            raise RuntimeError('Query of "up" status failed. Weirdness')
        return sysenv, store
Ejemplo n.º 3
0
    def run(self, nano=None, debug=None, timeout=240):
        '''Actually start the nanoprobe and see if it worked.

        If no nanoprobe is supplied, choose a running system that is not
        yet running the nanoprobe service; record SKIPPED when none exists.
        '''
        if debug is None:
            debug = self.debug
        if nano is None:
            # Pick a running system without the nanoprobe service.
            nanozero = self.testenviron.select_nano_noservice()
            if len(nanozero) > 0:
                nano = nanozero[0]
        # Nothing suitable (or already running the service) -> skip the test.
        if (nano is None or nano.status != TestSystem.RUNNING
                or SystemTestEnvironment.NANOSERVICE in nano.runningservices):
            return self._record(AssimSysTest.SKIPPED)

        # Log patterns that indicate a successful nanoprobe startup.
        regexes = self.nano_start_regexes(nano)

        watch = LogWatcher(self.logfilename,
                           regexes,
                           timeout=timeout,
                           debug=debug)
        watch.setwatch()
        # {0.hostname} is presumably substituted with the nano's hostname by
        # checkresults -- verify against its implementation.
        qstr = (
            '''START drone=node:Drone('*:*') '''
            '''WHERE drone.designation = "{0.hostname}" and drone.status = "up" '''
            '''RETURN drone''')
        nano.startservice(SystemTestEnvironment.NANOSERVICE)
        return self.checkresults(watch, timeout, qstr, None, nano)
    def initenviron(logname,
                    maxdrones,
                    mgmtsystem,
                    debug=False,
                    cmaimage='',
                    nanoimages=[],
                    timeout=90,
                    nanodebug=0,
                    cmadebug=0):
        '''Initialize the test environment.

        Like the simpler variant, but lets the caller pick the management
        system and container images.  Starts the SystemTestEnvironment,
        attaches a read-only Store to the CMA's Neo4j database, and checks
        that the expected number of drones report "up".

        Returns the tuple (sysenv, store).
        '''
        # Arm the log watcher before starting anything so no message is missed.
        logwatch = LogWatcher(logname, [],
                              timeout,
                              returnonlymatch=True,
                              debug=debug)
        logwatch.setwatch()

        sysenv = SystemTestEnvironment(logname,
                                       maxdrones,
                                       mgmtsystem,
                                       cmaimage=cmaimage,
                                       nanoimages=nanoimages,
                                       nanodebug=nanodebug,
                                       cmadebug=cmadebug)
        # Point the CMA library at the freshly started CMA's database (read-only).
        CMAinit(None,
                host=str(sysenv.cma.ipaddr),
                readonly=True,
                neologin=SystemTestEnvironment.NEO4JLOGIN,
                neopass=SystemTestEnvironment.NEO4JPASS)
        url = 'http://%s:%d/db/data/' % (sysenv.cma.ipaddr, 7474)
        print >> sys.stderr, 'OPENING Neo4j at URL %s' % url
        neo4j.authenticate('%s:7474' % sysenv.cma.ipaddr,
                           SystemTestEnvironment.NEO4JLOGIN,
                           SystemTestEnvironment.NEO4JPASS)
        store = Store(neo4j.Graph(url), readonly=True)
        # Register every known GraphNode class with the store.
        for classname in GN.GraphNode.classmap:
            GN.GraphNode.initclasstypeobj(store, classname)

        logger('$(grep MemFree: /proc/meminfo)', hardquote=False)
        tq = QueryTest(
            store,
            '''START drone=node:Drone('*:*') WHERE drone.status = "up" RETURN drone''',
            GN.nodeconstructor,
            debug=debug)

        # Expect exactly maxdrones+1 "up" rows -- presumably the nanoprobe
        # drones plus the CMA's own drone entry; verify against the schema.
        if not tq.check([
                None,
        ],
                        minrows=maxdrones + 1,
                        maxrows=maxdrones + 1,
                        delay=0.5,
                        maxtries=20):
            # Leave the containers around for post-mortem inspection.
            sysenv.cma.cleanupwhendone = False
            raise RuntimeError('Query of "up" status failed. Weirdness')
        return sysenv, store
Ejemplo n.º 5
0
    def testmain(logname, maxdrones=3, debug=False):
        '''Test our test cases.

        Brings up the test environment, then runs every test class in
        AssimSysTest.testset once, watching the log for messages that
        should never appear and asserting each test succeeds or skips.
        '''
        logger('Starting test of our test cases')
        try:
            sysenv, ourstore = AssimSysTest.initenviron(logname,
                                                        maxdrones,
                                                        debug,
                                                        cmadebug=5,
                                                        nanodebug=3)
        except AssertionError:
            print 'FAILED initial startup - which is pretty basic'
            print 'Any chance you have another CMA running??'
            raise RuntimeError('Another CMA is running(?)')

        # Log patterns whose appearance means something went badly wrong.
        badregexes = (
            ' ERROR: ',
            ' CRIT: ',
            ' CRITICAL: '
            # 'HBDEAD'
            #,   r'Peer at address .* is dead'
            ,
            r'OUTALLDONE .* while in state NONE')
        #for cls in [SimulCMAandNanoprobeRestart for j in range(0,20)]:
        #for j in range(0,10):
        #for cls in [DiscoverService for j in range(0,100)]:
        for cls in AssimSysTest.testset:
            # Fresh bad-message watcher per test so blame is per-test.
            badwatch = LogWatcher(logname, badregexes, timeout=1, debug=0)
            logger('CREATED LOG WATCH with %s' % str(badregexes))
            badwatch.setwatch()
            logger('Starting test %s' % (cls.__name__))
            if cls is DiscoverService:
                # DiscoverService needs a concrete service/monitor to discover.
                ret = cls(ourstore,
                          logname,
                          sysenv,
                          debug=debug,
                          service='ssh',
                          monitorname='check_ssh').run()
            else:
                ret = cls(ourstore, logname, sysenv, debug=debug).run()
            #print >> sys.stderr, 'Got return of %s from test %s' % (ret, cls.__name__)
            badmatch = badwatch.look(timeout=1)
            if badmatch is not None:
                print 'OOPS! Got bad results!', badmatch
                raise RuntimeError('Test %s said bad words! [%s]' %
                                   (cls.__name__, badmatch))
            assert ret == AssimSysTest.SUCCESS or ret == AssimSysTest.SKIPPED
            #assert ret == AssimSysTest.SUCCESS
        logger('WOOT! All tests were successful!')
Ejemplo n.º 6
0
 def testmain(logname, maxdrones=25, debug=False):
     """A simple test main program.

     Brings up the environment, checks that all drones registered, then
     shuts one nanoprobe down and checks the database noticed.  Returns 1
     on the initial query failure; otherwise falls through (returns None).
     """
     regexes = []
     # pylint says: [W0612:testmain] Unused variable 'j'
     # pylint: disable=W0612
     # One "Stored packages" message is expected per system (drones + CMA).
     for j in range(0, maxdrones + 1):
         regexes.append("Stored packages JSON data from *([^ ]*) ")
     logwatch = LogWatcher(logname, regexes, timeout=90, returnonlymatch=True)
     logwatch.setwatch()
     sysenv = SystemTestEnvironment(maxdrones)
     print >> sys.stderr, "Systems all up and running."
     url = "http://%s:%d/db/data/" % (sysenv.cma.ipaddr, 7474)
     CMAinit(None)
     store = Store(neo4j.GraphDatabaseService(url), readonly=True)
     # Register every known GraphNode class with the store.
     for classname in GN.GraphNode.classmap:
         GN.GraphNode.initclasstypeobj(store, classname)
     results = logwatch.lookforall()
     if debug:
         print >> sys.stderr, "WATCH RESULTS:", results
     tq = QueryTest(store, "START drone=node:Drone('*:*') RETURN drone", GN.nodeconstructor, debug=debug)
     print >> sys.stderr, "Running Query"
     if tq.check([None], minrows=maxdrones + 1, maxrows=maxdrones + 1):
         print "WOOT! Systems passed query check after initial startup!"
     else:
         print "Systems FAILED initial startup query check"
         print "Do you have a second CMA running??"
         print "Rerunning query with debug=True"
         tq.debug = True
         tq.check([None], minrows=maxdrones + 1, maxrows=maxdrones + 1)
         return 1
     cma = sysenv.cma
     nano = sysenv.nanoprobes[0]
     # Watch for the CMA logging the nanoprobe's graceful-shutdown report.
     regex = r"%s cma INFO: System %s at \[::ffff:%s]:1984 reports graceful shutdown" % (
         cma.hostname,
         nano.hostname,
         nano.ipaddr,
     )
     # print >> sys.stderr, 'REGEX IS: [%s]' % regex
     logwatch = LogWatcher(logname, [regex], timeout=30, returnonlymatch=False)
     logwatch.setwatch()
     nano.stopservice(SystemTestEnvironment.NANOSERVICE)
     logwatch.look()
     time.sleep(30)
     tq = QueryTest(
         store,
         ("""START drone=node:Drone('*:*') """ """WHERE drone.designation = "{0.hostname}" RETURN drone"""),
         GN.nodeconstructor,
         debug=debug,
     )
     # NOTE(review): downbyshutdown is not defined in this file -- presumably
     # a module-level validator callable; confirm it exists at module scope.
     if tq.check([nano], downbyshutdown, maxrows=1):
         print "WOOT! Systems passed query check after nano shutdown!"
     else:
         print "Systems FAILED query check after nano shutdown"
Ejemplo n.º 7
0
 def run(self, nano=None, debug=None, timeout=60):
     """Restart the CMA service and verify it comes back up."""
     if debug is None:
         debug = self.debug
     cma = self.testenviron.cma
     cma.stopservice(SystemTestEnvironment.CMASERVICE)
     # Arm the log watcher before restarting so no startup message is missed.
     watch = LogWatcher(self.logfilename, self.cma_start_regexes(),
                        timeout=timeout, debug=debug)
     watch.setwatch()
     if self.delay > 0:
         time.sleep(self.delay)
     cma.startservice(SystemTestEnvironment.CMASERVICE)
     # This just makes sure the database is still up - which it should be...
     # Once we receive the CMA update message, we really should already be good to go
     qstr = '''START one=node(*) RETURN one LIMIT 1'''
     return self.checkresults(watch, timeout, qstr, None, nano)
Ejemplo n.º 8
0
    def __init__(self):
        """Create a monitor with an empty workflow, the local manager
        machine, and a completed zero-length 'boot' schedule entry."""
        now = datetime.now()
        self.creation_timestamp = self.timestamp = now
        self.workflow = Workflow()
        self.logwatcher = LogWatcher()

        # The local manager occupies a dedicated condor slot.
        mgr = Machine()
        mgr.status = MachineStatus.manager
        mgr.condor_slot = 'local'
        self.machines = [mgr]

        # The 'boot' job is recorded as already finished at creation time.
        boot = ScheduleEntry(Job('boot', None), mgr, None, None)
        boot.real_start = now
        boot.real_end = now
        boot.status = EntryStatus.completed
        self.entries = [boot]
        self.entries_cid = {}  # condor id -> ScheduleEntry
Ejemplo n.º 9
0
def perform_tests(testset, sysenv, store, itermax, logname, debug=False):
    '''Actually perform the given set of tests the given number of times, etc.

    Randomly picks tests from testset until itermax of them have completed
    (skips do not count), failing any test during which a "bad" log message
    appears.  Returns summarize_tests().
    '''
    # Any of these log patterns flags a failure regardless of test outcome.
    badregexes = (
        r' (ERROR:|CRIT:|CRITICAL:|nanoprobe\[[0-9]*]: segfault at|'
        #r'Peer at address .* is dead|'
        r'OUTALLDONE .* while in state NONE'
        r')', )
    itercount = 1
    while True:
        test = random.choice(testset)
        # Fresh bad-message watcher per iteration so blame is per-test.
        badwatch = LogWatcher(logname, badregexes, timeout=1, debug=0)
        logit("STARTING test %d - %s" % (itercount, test.__name__))
        # Record system load/memory in the log for later diagnosis.
        os.system('logger -s "Load Avg: $(cat /proc/loadavg)"')
        os.system('logger -s "$(grep MemFree: /proc/meminfo)"')
        badwatch.setwatch()
        if test.__name__ == 'DiscoverService':
            # DiscoverService needs a concrete service/monitor to discover.
            testobj = test(store,
                           logname,
                           sysenv,
                           debug=debug,
                           service='ssh',
                           monitorname='check_ssh')
        else:
            testobj = test(store, logname, sysenv, debug=debug)
        ret = testobj.run()
        match = badwatch.look()
        if match is not None:
            # A bad message overrides whatever the test itself reported.
            logit('BAD MESSAGE from Test %d %s: %s' %
                  (itercount, test.__name__, match))
            testobj.replace_result(AssimSysTest.FAIL)
            ret = AssimSysTest.FAIL
        if ret == AssimSysTest.SUCCESS:
            logit('Test %d %s succeeded!' % (itercount, test.__name__))
            itercount += 1
        elif ret == AssimSysTest.FAIL:
            logit('Test %d %s FAILED :-(' % (itercount, test.__name__))
            itercount += 1
        elif ret == AssimSysTest.SKIPPED:
            # Skipped tests do not advance the iteration counter.
            logit('Test %d %s skipped' % (itercount, test.__name__))
        else:
            logit('Test %d %s RETURNED SOMETHING REALLY WEIRD [%s]' %
                  (itercount, test.__name__, str(ret)))
            testobj.replace_result(AssimSysTest.FAIL)
        print ''
        if itercount > itermax:
            break
    return summarize_tests()
 def run(self, nano=None, debug=None, timeout=60):
     """Restart the CMA and confirm via the log that it came back up."""
     if debug is None:
         debug = self.debug
     cma = self.testenviron.cma
     cma.stopservice(SystemTestEnvironment.CMASERVICE)
     # Watch for the CMA's startup banner announcing its component versions.
     startup_pat = (' %s .* INFO: Neo4j version .* // py2neo version .*'
                    ' // Python version .* // java version.*') % cma.hostname
     watch = LogWatcher(self.logfilename, (startup_pat,),
                        timeout=timeout, debug=debug)
     watch.setwatch()
     if self.delay > 0:
         time.sleep(self.delay)
     cma.startservice(SystemTestEnvironment.CMASERVICE)
     # This just makes sure the database is still up - which it should be...
     # Once we receive the CMA update message, we really should already be good to go
     qstr = '''START one=node(*) RETURN one LIMIT 1'''
     return self.checkresults(watch, timeout, qstr, None, nano)
Ejemplo n.º 11
0
 def __init__(self, logname, nanocount=10,
              cmaimage='assimilation/build-wily',
              nanoimages=('assimilation/build-wily',),
              sysclass=DockerSystem, cleanupwhendone=False,
              nanodebug=0, cmadebug=0, chunksize=20):
     '''Init/constructor for our SystemTestEnvironment.

     Spawns the CMA, waits for its startup banner in the log, then
     creates nanocount nanoprobes in chunks of chunksize.  Raises
     RuntimeError if the CMA never announces itself.
     '''
     self.sysclass = sysclass
     self.cmaimage = cmaimage
     self.nanoimages = nanoimages
     self.nanoprobes = []
     self.cma = None
     self.debug = 0
     self.cleanupwhendone = cleanupwhendone
     self.logname = logname
     # Arm the watcher before spawning the CMA so its banner isn't missed.
     watch = LogWatcher(logname, [])
     watch.setwatch()
     # NOTE(review): the caller-supplied debug levels are overridden here,
     # presumably as a temporary debugging aid -- confirm this is intended.
     nanodebug = 1
     cmadebug = 2
     self.nanodebug = nanodebug
     # BUG FIX: previously assigned nanodebug here (copy-paste error),
     # so self.cmadebug never reflected the CMA debug level.
     self.cmadebug = cmadebug
     self.spawncma(nanodebug=nanodebug, cmadebug=cmadebug)
     regex = (' %s .* INFO: Neo4j version .* // py2neo version .*'
             ' // Python version .* // (java|openjdk) version.*') % self.cma.hostname
     watch.setregexes((regex,))
     if watch.lookforall(timeout=120) is None:
         os.system("logger -s 'CMA did not start!! [[%s]]'" % (regex))
         print >> sys.stderr, 'CMA did not start!! %s' % regex
         raise RuntimeError('CMA did not start: %s' % regex)
     print >> sys.stderr, 'nanocount is', nanocount
     print >> sys.stderr, 'self.nanoimages is', self.nanoimages
     # We do this in chunks to manage stress on our test environment
     children_left = range(0, nanocount)
     while (len(children_left) > 0):
         self._create_nano_chunk(children_left[0:chunksize])
         del children_left[0:chunksize]
Ejemplo n.º 12
0
 def run(self, nano=None, debug=None, timeout=180):
     '''Actually stop the nanoprobe and see if it worked.

     If no nanoprobe is supplied, choose one that is running the
     nanoprobe service; record SKIPPED when none qualifies.
     '''
     if debug is None:
         debug = self.debug
     if nano is None:
         # Pick a system that is currently running the nanoprobe service.
         nanozero = self.testenviron.select_nano_service()
         if len(nanozero) > 0:
             nano = nanozero[0]
     # Nothing suitable to stop -> skip the test.
     if (nano is None or nano.status != TestSystem.RUNNING or
         SystemTestEnvironment.NANOSERVICE not in nano.runningservices):
         return self._record(AssimSysTest.SKIPPED)
     # Log patterns that indicate a clean nanoprobe shutdown.
     regexes = self.nano_stop_regexes(nano)
     watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
     watch.setwatch()
     # The drone should end up dead with reason HBSHUTDOWN; {0.hostname}
     # is presumably filled in by checkresults -- verify.
     qstr =  (   '''START drone=node:Drone('*:*') '''
                  '''WHERE drone.designation = "{0.hostname}" and drone.status = "dead" '''
                  '''and drone.reason = "HBSHUTDOWN"       RETURN drone''')
     nano.stopservice(SystemTestEnvironment.NANOSERVICE)
     return self.checkresults(watch, timeout, qstr, None, nano)
Ejemplo n.º 13
0
def perform_tests(testset, sysenv, store, itermax, logname, debug=False):
    '''Actually perform the given set of tests the given number of times, etc.

    Randomly picks tests from testset until itermax of them have completed
    (skips do not count), failing any test during which a "bad" log message
    appears.  Returns summarize_tests().
    '''
    # Any of these log patterns flags a failure regardless of test outcome.
    badregexes=(r' (ERROR:|CRIT:|CRITICAL:|nanoprobe\[[0-9]*]: segfault at|'
            #r'Peer at address .* is dead|'
            r'OUTALLDONE .* while in state NONE'
            r')',)
    itercount=1
    while True:
        test = random.choice(testset)
        # Fresh bad-message watcher per iteration so blame is per-test.
        badwatch = LogWatcher(logname, badregexes, timeout=1, debug=0)
        logit("STARTING test %d - %s" %   (itercount, test.__name__))
        # Record system load/memory in the log for later diagnosis.
        os.system('logger -s "Load Avg: $(cat /proc/loadavg)"')
        os.system('logger -s "$(grep MemFree: /proc/meminfo)"')
        badwatch.setwatch()
        if test.__name__ == 'DiscoverService':
            # DiscoverService needs a concrete service/monitor to discover.
            testobj = test(store, logname, sysenv, debug=debug
            ,       service='ssh', monitorname='check_ssh')
        else:
            testobj = test(store, logname, sysenv, debug=debug)
        ret = testobj.run()
        match = badwatch.look()
        if match is not None:
            # A bad message overrides whatever the test itself reported.
            logit('BAD MESSAGE from Test %d %s: %s' % (itercount, test.__name__, match))
            testobj.replace_result(AssimSysTest.FAIL)
            ret = AssimSysTest.FAIL
        if ret == AssimSysTest.SUCCESS:
            logit('Test %d %s succeeded!' % (itercount, test.__name__))
            itercount += 1
        elif ret == AssimSysTest.FAIL:
            logit('Test %d %s FAILED :-(' % (itercount, test.__name__))
            itercount += 1
        elif ret == AssimSysTest.SKIPPED:
            # Skipped tests do not advance the iteration counter.
            logit('Test %d %s skipped' % (itercount, test.__name__))
        else:
            logit('Test %d %s RETURNED SOMETHING REALLY WEIRD [%s]'
            %   (itercount, test.__name__, str(ret)))
            testobj.replace_result(AssimSysTest.FAIL)
        print ''
        if itercount > itermax:
            break
    return summarize_tests()
Ejemplo n.º 14
0
 def __init__(self, logname, nanocount=10,
              cmaimage='assimilation/build-utopic',
              nanoimages=('assimilation/build-utopic',),
              sysclass=DockerSystem, cleanupwhendone=True,
              nanodebug=0, cmadebug=0, chunksize=20):
     '''Init/constructor for our SystemTestEnvironment.

     Spawns the CMA, waits for its startup banner in the log, then
     creates nanocount nanoprobes in chunks of chunksize.  Raises
     RuntimeError if the CMA never announces itself.
     '''
     self.sysclass = sysclass
     self.cmaimage = cmaimage
     self.nanoimages = nanoimages
     self.nanoprobes = []
     self.cma = None
     self.debug = 0
     self.cleanupwhendone = cleanupwhendone
     self.logname = logname
     # Arm the watcher before spawning the CMA so its banner isn't missed.
     watch = LogWatcher(logname, [])
     watch.setwatch()
     # NOTE(review): the caller-supplied debug levels are overridden here,
     # presumably as a temporary debugging aid -- confirm this is intended.
     nanodebug = 1
     cmadebug = 0
     self.nanodebug = nanodebug
     # BUG FIX: previously assigned nanodebug here (copy-paste error),
     # so self.cmadebug never reflected the CMA debug level.
     self.cmadebug = cmadebug
     self.spawncma(nanodebug=nanodebug, cmadebug=cmadebug)
     regex = (' %s .* INFO: Neo4j version .* // py2neo version .*'
             ' // Python version .* // java version.*') % self.cma.hostname
     watch.setregexes((regex,))
     if watch.lookforall(timeout=60) is None:
         print >> sys.stderr, 'CMA did not start!!'
         raise RuntimeError('CMA did not start')
     print >> sys.stderr, 'nanocount is', nanocount
     print >> sys.stderr, 'self.nanoimages is', self.nanoimages
     # We do this in chunks to manage stress on our test environment
     children_left = range(0, nanocount)
     while (len(children_left) > 0):
         self._create_nano_chunk(children_left[0:chunksize])
         del children_left[0:chunksize]
Ejemplo n.º 15
0
    def testmain(logname, maxdrones=3, debug=False):
        '''Test our test cases.

        Brings up the test environment, then runs every test class in
        AssimSysTest.testset once, watching the log for messages that
        should never appear and asserting each test succeeds or skips.
        '''
        logger('Starting test of our test cases')
        try:
            sysenv, ourstore = AssimSysTest.initenviron(logname, maxdrones, debug
            ,       cmadebug=5, nanodebug=3)
        except AssertionError:
            print 'FAILED initial startup - which is pretty basic'
            print 'Any chance you have another CMA running??'
            raise RuntimeError('Another CMA is running(?)')

        # Log patterns whose appearance means something went badly wrong.
        badregexes=(' ERROR: ', ' CRIT: ', ' CRITICAL: '
        # 'HBDEAD'
        #,   r'Peer at address .* is dead'
        ,   r'OUTALLDONE .* while in state NONE'
        )
        #for cls in [SimulCMAandNanoprobeRestart for j in range(0,20)]:
        #for j in range(0,10):
        #for cls in [DiscoverService for j in range(0,100)]:
        for cls in AssimSysTest.testset:
            # Fresh bad-message watcher per test so blame is per-test.
            badwatch = LogWatcher(logname, badregexes, timeout=1, debug=0)
            logger('CREATED LOG WATCH with %s' % str(badregexes))
            badwatch.setwatch()
            logger('Starting test %s' %   (cls.__name__))
            if cls is DiscoverService:
                # DiscoverService needs a concrete service/monitor to discover.
                ret = cls(ourstore, logname, sysenv, debug=debug
                ,       service='ssh', monitorname='check_ssh').run()
            else:
                ret = cls(ourstore, logname, sysenv, debug=debug).run()
            #print >> sys.stderr, 'Got return of %s from test %s' % (ret, cls.__name__)
            badmatch = badwatch.look(timeout=1)
            if badmatch is not None:
                print 'OOPS! Got bad results!', badmatch
                raise RuntimeError('Test %s said bad words! [%s]' % (cls.__name__, badmatch))
            assert ret == AssimSysTest.SUCCESS or ret == AssimSysTest.SKIPPED
            #assert ret == AssimSysTest.SUCCESS
        logger('WOOT! All tests were successful!')
Ejemplo n.º 16
0
 def __init__(self):
     """Initialize monitor bookkeeping: workflow, log watcher, the local
     manager machine, and its pre-completed 'boot' schedule entry."""
     self.workflow = Workflow()
     self.logwatcher = LogWatcher()
     self.creation_timestamp = self.timestamp = datetime.now()

     # The local manager occupies a dedicated condor slot.
     local_manager = Machine()
     local_manager.status = MachineStatus.manager
     local_manager.condor_slot = 'local'
     self.machines = [local_manager]

     # The 'boot' job is recorded as already finished at creation time.
     entry = ScheduleEntry(Job('boot', None), local_manager, None, None)
     entry.real_start = entry.real_end = self.timestamp
     entry.status = EntryStatus.completed
     self.entries = [entry]
     self.entries_cid = {}  # condor id -> ScheduleEntry
Ejemplo n.º 17
0
 def run(self, nano=None, debug=None, timeout=300):
     '''Our default timeout is so long because we can take a while to give up shutting down
     the nanoprobe - an ACK timeout might have to occur before it can shut down.
     '''
     if debug is None:
         debug = self.debug
     if nano is None:
         nanozero = self.testenviron.select_nano_service()
         if len(nanozero) < 1:
             return self._record(AssimSysTest.SKIPPED)
     nano = nanozero[0]
     cma = self.testenviron.cma
     regexes = self.nano_stop_regexes(nano)
     regexes.extend(self.cma_stop_regexes())
     regexes.extend(self.cma_start_regexes())
     watch = LogWatcher(self.logfilename,
                        regexes,
                        timeout=timeout,
                        debug=debug)
     watch.setwatch()
     cma.stopservice(SystemTestEnvironment.CMASERVICE)
     nano.stopservice(SystemTestEnvironment.NANOSERVICE, async=True)
     cma.startservice(SystemTestEnvironment.CMASERVICE)
     if self.delay > 0:
         time.sleep(self.delay)
     qstr = (
         '''START drone=node:Drone('*:*') '''
         '''WHERE drone.designation = "{0.hostname}" and drone.status = "dead" '''
         '''and drone.reason = "HBSHUTDOWN"       RETURN drone''')
     rc = self.checkresults(watch, timeout, qstr, None, nano)
     if rc != AssimSysTest.SUCCESS:
         return rc
     # We have to do this in two parts because of the asynchronous shutdown above
     regexes = self.nano_start_regexes(nano)
     watch = LogWatcher(self.logfilename,
                        regexes,
                        timeout=timeout,
                        debug=debug)
     watch.setwatch()
     nano.startservice(SystemTestEnvironment.NANOSERVICE)
     qstr = (
         '''START drone=node:Drone('*:*') '''
         '''WHERE drone.designation = "{0.hostname}" and drone.status = "up" '''
         '''RETURN drone''')
     return self.checkresults(watch, timeout, qstr, None, nano)
Ejemplo n.º 18
0
 def _create_nano_chunk(self, childnos):
     '''Create a chunk of nanoprobes.

     Spawns one nanoprobe per element of childnos, then waits for each
     to connect, register, and have its tcpdiscovery data processed.
     Raises RuntimeError if any expected log message never appears.
     '''
     # Arm the watcher before spawning so no startup message is missed.
     watch = LogWatcher(self.logname, [])
     watch.setwatch()
     regexes = []
     for childcount in childnos:
         childcount = childcount  # Make pylint happy...
         nano = self.spawnnanoprobe(debug=self.nanodebug)
         # Three per-nanoprobe milestones: connect, register, discovery processed.
         regexes.extend([
             r' (%s) nanoprobe\[.*]: NOTICE: Connected to CMA.  Happiness :-D'
             % (nano.hostname),
             r' %s cma INFO: Drone %s registered from address \[::ffff:%s]'
             % (self.cma.hostname, nano.hostname, nano.ipaddr),
             r' %s cma INFO: Processed tcpdiscovery JSON data from (%s) into graph.'
             % (self.cma.hostname, nano.hostname),
         ])
         self.nanoprobes.append(nano)
     watch.setregexes(regexes)
     if watch.lookforall(timeout=30) is None:
         raise RuntimeError(
             'Nanoprobes did not start [%s, %s] - missing %s' %
             (nano.hostname, nano.ipaddr, str(watch.unmatched)))
Ejemplo n.º 19
0
 def _create_nano_chunk(self, childnos):
     '''Create a chunk of nanoprobes.

     Spawns one nanoprobe per element of childnos, then waits for each
     to connect, register, and have its tcpdiscovery data processed.
     Raises RuntimeError if any expected log message never appears.
     '''
     # Arm the watcher before spawning so no startup message is missed.
     watch = LogWatcher(self.logname, [], debug=0)
     watch.setwatch()
     regexes = []
     for childcount in childnos:
         childcount = childcount # Make pylint happy...
         nano = self.spawnnanoprobe(debug=self.nanodebug)
         # Three per-nanoprobe milestones: connect, register, discovery processed.
         regexes .extend([
             r' %s nanoprobe\[.*]: NOTICE: Connected to CMA.  Happiness :-D'
             %   (nano.hostname),
             r' %s cma INFO: Drone %s registered from address \[::ffff:%s]'
             %           (self.cma.hostname, nano.hostname, nano.ipaddr),
             r' %s cma INFO: Processed u?n?changed tcpdiscovery'
             r' JSON data from %s into graph.'
             %       (self.cma.hostname, nano.hostname),
         ])
         self.nanoprobes.append(nano)
     print >> sys.stderr, len(regexes), 'NANOPROBE REGEXES ARE:', (regexes)
     watch.setregexes(regexes)
     if watch.lookforall(timeout=120) is None:
         raise RuntimeError('Nanoprobes did not start - missing %s'
         %   (str(watch.unmatched)))
Ejemplo n.º 20
0
 def _create_nano_chunk(self, childnos):
     '''Create a chunk of nanoprobes.

     Spawns one nanoprobe per element of childnos, then waits for each
     to connect, register, and have its tcpdiscovery data processed.
     Raises RuntimeError if any expected log message never appears.
     '''
     # Arm the watcher before spawning so no startup message is missed.
     watch = LogWatcher(self.logname, [])
     watch.setwatch()
     regexes = []
     for childcount in childnos:
         childcount = childcount # Make pylint happy...
         nano = self.spawnnanoprobe(debug=self.nanodebug)
         # Three per-nanoprobe milestones: connect, register, discovery processed.
         regexes .extend([
             r' (%s) nanoprobe\[.*]: NOTICE: Connected to CMA.  Happiness :-D'
             %   (nano.hostname),
             r' %s cma INFO: Drone %s registered from address \[::ffff:%s]'
             %           (self.cma.hostname, nano.hostname, nano.ipaddr),
             r' %s cma INFO: Processed tcpdiscovery JSON data from (%s) into graph.'
             %       (self.cma.hostname, nano.hostname),
         ])
         self.nanoprobes.append(nano)
     watch.setregexes(regexes)
     if watch.lookforall(timeout=30) is None:
         raise RuntimeError('Nanoprobes did not start [%s, %s] - missing %s'
         %   (nano.hostname, nano.ipaddr, str(watch.unmatched)))
Ejemplo n.º 21
0
 def run(self, nano=None, debug=None, timeout=300):
     '''Our default timeout is so long because we can take a while to give up shutting down
     the nanoprobe - an ACK timeout might have to occur before it can shut down.
     '''
     if debug is None:
         debug = self.debug
     if nano is None:
         nanozero = self.testenviron.select_nano_service()
         if len(nanozero) < 1:
             return self._record(AssimSysTest.SKIPPED)
     nano = nanozero[0]
     cma = self.testenviron.cma
     regexes = self.nano_stop_regexes(nano)
     regexes.extend(self.cma_stop_regexes())
     regexes.extend(self.cma_start_regexes())
     watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
     watch.setwatch()
     cma.stopservice(SystemTestEnvironment.CMASERVICE)
     nano.stopservice(SystemTestEnvironment.NANOSERVICE, async=True)
     cma.startservice(SystemTestEnvironment.CMASERVICE)
     if self.delay > 0:
         time.sleep(self.delay)
     qstr =  (   '''START drone=node:Drone('*:*') '''
                  '''WHERE drone.designation = "{0.hostname}" and drone.status = "dead" '''
                  '''and drone.reason = "HBSHUTDOWN"       RETURN drone''')
     rc = self.checkresults(watch, timeout, qstr, None, nano)
     if rc != AssimSysTest.SUCCESS:
         return rc
     # We have to do this in two parts because of the asynchronous shutdown above
     regexes = self.nano_start_regexes(nano)
     watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
     watch.setwatch()
     nano.startservice(SystemTestEnvironment.NANOSERVICE)
     qstr = (    '''START drone=node:Drone('*:*') '''
                  '''WHERE drone.designation = "{0.hostname}" and drone.status = "up" '''
                  '''RETURN drone''')
     return self.checkresults(watch, timeout, qstr, None, nano)
Ejemplo n.º 22
0
#!/usr/bin/python
# Tail a squid access log and load each entry into MongoDB.

from pymongo import MongoClient
from logwatcher import LogWatcher
import datetime
import json

# Field names of a squid access.log line, in column order.
header = ["time", "duration", "client_addr", "result_code", "bytes", "req_method", "URL", "user", "hier_code", "type"]
# Expected number of whitespace-separated columns per line.
COL = len(header)

# Local MongoDB: database "squid", collection "accesslog".
client = MongoClient('localhost', 27017)
db = client.squid
accesslog = db.accesslog

def callback(filename, lines):
    for line in lines:
	rec = list(line.split())
	if len(rec) != COL : continue
        record = dict(zip(header, rec))
	# print json.dumps(record, indent=2)
	result = accesslog.insert_one(record)
	print result.inserted_id

# Follow the squid log directory forever, feeding new lines to callback.
lw = LogWatcher("/var/log/squid3", callback)
lw.loop()
Ejemplo n.º 23
0
class Monitor():
    """Tracks workflow jobs and condor machines by polling condor state and
    tailing workflow logs through a LogWatcher."""

    def __init__(self):
        self.workflow = Workflow()
        # creation_timestamp stays fixed at startup; timestamp advances on
        # each monitoring cycle via update_timestamp().
        self.creation_timestamp = self.timestamp = datetime.now()
        self.logwatcher = LogWatcher()

        # The local condor manager is modeled as a pseudo-machine so every
        # schedule entry has a host to attach to.
        manager = Machine()
        manager.status = MachineStatus.manager
        manager.condor_slot = 'local'
        self.machines = [manager]

        # Synthetic, already-completed "boot" entry for the manager.
        boot_entry = ScheduleEntry(Job('boot', None), manager, None, None)
        boot_entry.real_start = self.timestamp
        boot_entry.real_end = self.timestamp
        boot_entry.status = EntryStatus.completed
        self.entries = [boot_entry]
        # Index of schedule entries keyed by condor job id.
        self.entries_cid = {}

    def add_workflow(self, workflow_dir):
        """Register a workflow directory and start watching its logs."""
        wf_id = self.workflow.add_workflow(workflow_dir)
        self.logwatcher.add(wf_id, workflow_dir)

    def sync_machines(self):
        """Create Machine records (with completed boot entries) for any
        condor slot not seen before."""
        slots = condor_slots()
        for s in slots:
            if s not in [m.condor_slot for m in self.machines]:
                machine = Machine()
                machine.status = MachineStatus.running
                machine.condor_slot = s
                boot_job = Job('boot', None)
                boot_entry = ScheduleEntry(boot_job, machine, None, None)
                # Boot interval is approximated as [monitor start, now].
                boot_entry.log[LogKey.real_start] = self.creation_timestamp
                boot_entry.log[LogKey.real_end] = self.timestamp
                boot_entry.status = EntryStatus.completed
                self.entries.append(boot_entry)
                self.machines.append(machine)
                print "++Machine", s

    def sync_jobs(self):
        """Consume new log events, creating/updating schedule entries and
        resolving terminated condor jobs back to workflow jobs and hosts."""
        log_entries = self.logwatcher.nexts()
        for le in log_entries:
            if le.id in self.entries_cid:  # already indexed by condor id
                entry = self.entries_cid[le.id]
            else:
                entry = ScheduleEntry(condor_id=le.id)
                self.entries.append(entry)
                self.entries_cid[le.id] = entry
                print "++Job", le.id

            # Record the raw event timestamp under its log key.
            entry.log[le.event] = le.timestamp

            if le.event == LogKey.execute:
                entry.status = EntryStatus.executing
            elif le.event == LogKey.job_terminated:
                entry.status = EntryStatus.completed
                wf_id, dag_job_id, slot = condor_history(le.id)

                # Match the condor job to a workflow job; the host falls
                # back to the manager machine when the slot is unknown.
                job = next(
                    (j for j in self.workflow.jobs
                     if j.dag_job_id == dag_job_id and j.wf_id == wf_id), None)
                if job:
                    entry.job = job
                    entry.host = next(
                        (m for m in self.machines if m.condor_slot == slot),
                        self.machines[0])
                    print "--Job", le.id, dag_job_id, entry.host.condor_slot

    def update_timestamp(self):
        """Advance the monitor's current timestamp to now."""
        self.timestamp = datetime.now()
Ejemplo n.º 24
0
 def run(self, nano=None, debug=None, timeout=240, service=None, monitorname=None):
     """System test: cycle a nanoprobe while a monitored service runs.

     Selects a nanoprobe not running *service* (unless one is supplied),
     ensures the nanoprobe service is up, stops it, restarts it together
     with the monitored service while watching the log for the expected
     messages, then validates the drone's "up" state in the store.
     Returns a recorded AssimSysTest result code.
     """
     # Fall back to instance-level configuration for unset arguments.
     if debug is None:
         debug = self.debug
     if service is None:
         service = self.service
     if monitorname is None:
         monitorname = self.monitorname
     if nano is None:
         # Pick a nanoprobe that is not already running the service.
         nanozero = self.testenviron.select_nano_noservice(service=service)
         if nanozero is None or len(nanozero) < 1:
             return self._record(AssimSysTest.SKIPPED)
         else:
             nano = nanozero[0]
     assert service not in nano.runningservices
     if SystemTestEnvironment.NANOSERVICE not in nano.runningservices:
         # Bring the nanoprobe service up first, waiting for its start
         # message to appear in the log.
         startregexes = self.nano_start_regexes(nano)
         watch = LogWatcher(self.logfilename, startregexes, timeout=timeout, debug=debug)
         watch.setwatch()
         nano.startservice(SystemTestEnvironment.NANOSERVICE)
         match = watch.look(timeout=timeout)
         if match is None:
             logger('ERROR: Test %s timed out waiting for any of %s [timeout:%s]'
             %   (self.__class__.__name__, str(watch.regexes), timeout))
             return self._record(AssimSysTest.FAIL)
     # Stop the nanoprobe and wait for all the expected stop messages.
     regexes = self.nano_stop_regexes(nano)
     watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
     watch.setwatch()
     nano.stopservice(SystemTestEnvironment.NANOSERVICE)
     if watch.lookforall(timeout=timeout) is None:
         logger('ERROR: Test %s timed out waiting for all of %s [timeout:%s]'
         %   (self.__class__.__name__, str(watch.unmatched), timeout))
         return self._record(AssimSysTest.FAIL)
     # Restart the monitored service and the nanoprobe, expecting nanoprobe
     # start, monitor start and service start messages in the log.
     regexes = self.nano_start_regexes(nano)
     regexes.extend(self.nano_startmonitor_regexes(nano, monitorname))
     regexes.extend(self.nano_service_start_regexes(nano, monitorname))
     watch = LogWatcher(self.logfilename, regexes, timeout=timeout, debug=debug)
     watch.setwatch()
     nano.startservice(service)
     nano.startservice(SystemTestEnvironment.NANOSERVICE)
     if watch.lookforall(timeout=timeout) is None:
         logger('ERROR: Test %s timed out waiting for all of %s [timeout:%s]'
         %   (self.__class__.__name__, str(watch.unmatched), timeout))
         return self._record(AssimSysTest.FAIL)
     # @TODO make a better query
     # but it should be enough to let us validate the rest
     qstr = (    '''START drone=node:Drone('*:*') '''
                  '''WHERE drone.designation = "{0.hostname}" and drone.status = "up" '''
                  '''RETURN drone''')
     return self.checkresults(watch, timeout, qstr, None, nano, debug=debug)
Ejemplo n.º 25
0
class Monitor():
    """Tracks workflow jobs and condor machines by polling condor state and
    tailing workflow logs through a LogWatcher."""

    def __init__(self):
        self.workflow = Workflow()
        # creation_timestamp stays fixed at startup; timestamp advances on
        # each monitoring cycle via update_timestamp().
        self.creation_timestamp = self.timestamp = datetime.now()
        self.logwatcher = LogWatcher()

        # The local condor manager is modeled as a pseudo-machine so every
        # schedule entry has a host to attach to.
        manager = Machine()
        manager.status = MachineStatus.manager
        manager.condor_slot = 'local'
        self.machines = [manager]

        # Synthetic, already-completed "boot" entry for the manager.
        boot_entry = ScheduleEntry(Job('boot', None), manager, None, None)
        boot_entry.real_start = self.timestamp
        boot_entry.real_end = self.timestamp
        boot_entry.status = EntryStatus.completed
        self.entries = [boot_entry]
        # Index of schedule entries keyed by condor job id.
        self.entries_cid = {}

    def add_workflow(self, workflow_dir):
        """Register a workflow directory and start watching its logs."""
        wf_id = self.workflow.add_workflow(workflow_dir)
        self.logwatcher.add(wf_id, workflow_dir)

    def sync_machines(self):
        """Create Machine records (with completed boot entries) for any
        condor slot not seen before."""
        slots = condor_slots()
        for s in slots:
            if s not in [m.condor_slot for m in self.machines]:
                machine = Machine()
                machine.status = MachineStatus.running
                machine.condor_slot = s
                boot_job = Job('boot', None)
                boot_entry = ScheduleEntry(boot_job, machine, None, None)
                # Boot interval is approximated as [monitor start, now].
                boot_entry.log[LogKey.real_start] = self.creation_timestamp
                boot_entry.log[LogKey.real_end] = self.timestamp
                boot_entry.status = EntryStatus.completed
                self.entries.append(boot_entry)
                self.machines.append(machine)
                print "++Machine", s

    def sync_jobs(self):
        """Consume new log events, creating/updating schedule entries and
        resolving terminated condor jobs back to workflow jobs and hosts."""
        log_entries = self.logwatcher.nexts()
        for le in log_entries:
            if le.id in self.entries_cid: # already indexed by condor id
                entry = self.entries_cid[le.id]
            else:
                entry = ScheduleEntry(condor_id=le.id)
                self.entries.append(entry)
                self.entries_cid[le.id] = entry
                print "++Job", le.id

            # Record the raw event timestamp under its log key.
            entry.log[le.event] = le.timestamp

            if le.event == LogKey.execute:
                entry.status = EntryStatus.executing
            elif le.event == LogKey.job_terminated:
                entry.status = EntryStatus.completed
                wf_id, dag_job_id, slot = condor_history(le.id)

                # Match the condor job to a workflow job; the host falls
                # back to the manager machine when the slot is unknown.
                job = next((j for j in self.workflow.jobs if j.dag_job_id == dag_job_id and j.wf_id == wf_id), None)
                if job:
                    entry.job = job
                    entry.host = next((m for m in self.machines if m.condor_slot == slot), self.machines[0])
                    print "--Job", le.id, dag_job_id, entry.host.condor_slot

    def update_timestamp(self):
        """Advance the monitor's current timestamp to now."""
        self.timestamp = datetime.now()
Ejemplo n.º 26
0
#!/usr/bin/python

from pymongo import MongoClient
from logwatcher import LogWatcher
import datetime
import json

# Field names for squid's default access-log format, in column order.
header = [
    "time", "duration", "client_addr", "result_code", "bytes", "req_method",
    "URL", "user", "hier_code", "type"
]
COL = len(header)  # expected number of whitespace-separated fields per line

# MongoDB handles: database "squid", collection "accesslog" on localhost:27017.
client = MongoClient('localhost', 27017)
db = client.squid
accesslog = db.accesslog


def callback(filename, lines):
    for line in lines:
        rec = list(line.split())
        if len(rec) != COL: continue
        record = dict(zip(header, rec))
        # print json.dumps(record, indent=2)
        result = accesslog.insert_one(record)
        print result.inserted_id


lw = LogWatcher("/var/log/squid3", callback)
lw.loop()
Ejemplo n.º 27
0
 def testmain(logname, maxdrones=25, debug=False):
     """Simple test main: bring up a test environment with *maxdrones*
     nanoprobes, verify all drones appear in the store, then shut one
     nanoprobe down and verify it is marked down by graceful shutdown."""
     # Expect one "Stored packages" message per drone plus the CMA itself.
     regexes = []
     #pylint says: [W0612:testmain] Unused variable 'j'
     #pylint: disable=W0612
     for j in range(0, maxdrones + 1):
         regexes.append('Stored packages JSON data from *([^ ]*) ')
     logwatch = LogWatcher(logname,
                           regexes,
                           timeout=90,
                           returnonlymatch=True)
     logwatch.setwatch()
     sysenv = SystemTestEnvironment(maxdrones)
     print >> sys.stderr, 'Systems all up and running.'
     url = ('http://%s:%d/db/data/' % (sysenv.cma.ipaddr, 7474))
     CMAinit(None)
     store = Store(neo4j.Graph(url), readonly=True)
     # Prime the graph-node class map so queries can construct node objects.
     for classname in GN.GraphNode.classmap:
         GN.GraphNode.initclasstypeobj(store, classname)
     results = logwatch.lookforall()
     if debug:
         print >> sys.stderr, 'WATCH RESULTS:', results
     # Every drone (plus the CMA) should now be present in the store.
     tq = QueryTest(store,
                    "START drone=node:Drone('*:*') RETURN drone",
                    GN.nodeconstructor,
                    debug=debug)
     print >> sys.stderr, 'Running Query'
     if tq.check([
             None,
     ], minrows=maxdrones + 1, maxrows=maxdrones + 1):
         print 'WOOT! Systems passed query check after initial startup!'
     else:
         print 'Systems FAILED initial startup query check'
         print 'Do you have a second CMA running??'
         print 'Rerunning query with debug=True'
         tq.debug = True
         tq.check([
             None,
         ], minrows=maxdrones + 1, maxrows=maxdrones + 1)
         return 1
     cma = sysenv.cma
     nano = sysenv.nanoprobes[0]
     # Wait for the CMA to log this nanoprobe's graceful-shutdown report.
     regex = (
         r'%s cma INFO: System %s at \[::ffff:%s]:1984 reports graceful shutdown'
         % (cma.hostname, nano.hostname, nano.ipaddr))
     #print >> sys.stderr, 'REGEX IS: [%s]' % regex
     logwatch = LogWatcher(logname, [
         regex,
     ],
                           timeout=30,
                           returnonlymatch=False)
     logwatch.setwatch()
     nano.stopservice(SystemTestEnvironment.NANOSERVICE)
     logwatch.look()
     # Give the CMA time to persist the state change before querying.
     time.sleep(30)
     tq = QueryTest(
         store,
         ('''START drone=node:Drone('*:*') '''
          '''WHERE drone.designation = "{0.hostname}" RETURN drone'''),
         GN.nodeconstructor,
         debug=debug)
     # NOTE(review): `downbyshutdown` is a row-check callable defined
     # elsewhere in this module -- not visible in this chunk.
     if tq.check([
             nano,
     ], downbyshutdown, maxrows=1):
         print 'WOOT! Systems passed query check after nano shutdown!'
     else:
         print 'Systems FAILED query check after nano shutdown'
Ejemplo n.º 28
0
 def run(self,
         nano=None,
         debug=None,
         timeout=240,
         service=None,
         monitorname=None):
     """System test: cycle a nanoprobe while a monitored service runs.

     Selects a nanoprobe not running *service* (unless one is supplied),
     ensures the nanoprobe service is up, stops it, restarts it together
     with the monitored service while watching the log for the expected
     messages, then validates the drone's "up" state in the store.
     Returns a recorded AssimSysTest result code.
     """
     # Fall back to instance-level configuration for unset arguments.
     if debug is None:
         debug = self.debug
     if service is None:
         service = self.service
     if monitorname is None:
         monitorname = self.monitorname
     if nano is None:
         # Pick a nanoprobe that is not already running the service.
         nanozero = self.testenviron.select_nano_noservice(service=service)
         if nanozero is None or len(nanozero) < 1:
             return self._record(AssimSysTest.SKIPPED)
         else:
             nano = nanozero[0]
     assert service not in nano.runningservices
     if SystemTestEnvironment.NANOSERVICE not in nano.runningservices:
         # Bring the nanoprobe service up first, waiting for its start
         # message to appear in the log.
         startregexes = self.nano_start_regexes(nano)
         watch = LogWatcher(self.logfilename,
                            startregexes,
                            timeout=timeout,
                            debug=debug)
         watch.setwatch()
         nano.startservice(SystemTestEnvironment.NANOSERVICE)
         match = watch.look(timeout=timeout)
         if match is None:
             logger(
                 'ERROR: Test %s timed out waiting for any of %s [timeout:%s]'
                 % (self.__class__.__name__, str(watch.regexes), timeout))
             return self._record(AssimSysTest.FAIL)
     # Stop the nanoprobe and wait for all the expected stop messages.
     regexes = self.nano_stop_regexes(nano)
     watch = LogWatcher(self.logfilename,
                        regexes,
                        timeout=timeout,
                        debug=debug)
     watch.setwatch()
     nano.stopservice(SystemTestEnvironment.NANOSERVICE)
     if watch.lookforall(timeout=timeout) is None:
         logger(
             'ERROR: Test %s timed out waiting for all of %s [timeout:%s]' %
             (self.__class__.__name__, str(watch.unmatched), timeout))
         return self._record(AssimSysTest.FAIL)
     # Restart the monitored service and the nanoprobe, expecting nanoprobe
     # start, monitor start and service start messages in the log.
     regexes = self.nano_start_regexes(nano)
     regexes.extend(self.nano_startmonitor_regexes(nano, monitorname))
     regexes.extend(self.nano_service_start_regexes(nano, monitorname))
     watch = LogWatcher(self.logfilename,
                        regexes,
                        timeout=timeout,
                        debug=debug)
     watch.setwatch()
     nano.startservice(service)
     nano.startservice(SystemTestEnvironment.NANOSERVICE)
     if watch.lookforall(timeout=timeout) is None:
         logger(
             'ERROR: Test %s timed out waiting for all of %s [timeout:%s]' %
             (self.__class__.__name__, str(watch.unmatched), timeout))
         return self._record(AssimSysTest.FAIL)
     # @TODO make a better query
     # but it should be enough to let us validate the rest
     qstr = (
         '''START drone=node:Drone('*:*') '''
         '''WHERE drone.designation = "{0.hostname}" and drone.status = "up" '''
         '''RETURN drone''')
     return self.checkresults(watch, timeout, qstr, None, nano, debug=debug)
Ejemplo n.º 29
0
    print_log('INFO: Total write lines = ' + str(len(lines)))

def write2HDFS(localpath, hdfspath):
    print_log('INFO: Append local file to hdfs. ' + localpath + '-> hdfs://' + hdfspath )
    p = Popen([HDFS_BIN, 'dfs', '-appendToFile', localpath, hdfspath], stdout=PIPE, stderr=PIPE)
    result = p.stdout.read()
    if result:
        print result
    result_err = p.stderr.read()
    if result_err:
        print result_err

def makeHDFSDir(newpath):
    print_log('INFO: Make the HDFS dir = hdfs://' + newpath)
    p = Popen([HDFS_BIN, 'dfs', '-mkdir', newpath], stdout=PIPE, stderr=PIPE)
    result = p.stdout.read()
    if result:
        print result
    result_err = p.stderr.read()
    if result_err:
        print result_err

def print_log(msg):
    print time.strftime("%Y/%m/%d %H:%M:%S") + '\t' + msg

if __name__ == '__main__':
    # Watch the configured log file, invoking callback on new lines every
    # TIME_CYCLE_SEC seconds; loop() blocks until the watcher exits.
    watcher = LogWatcher(LOG_DIR + LOG_FILE, callback, True)
    watcher.loop(TIME_CYCLE_SEC)
    print "Exiting Main Thread"
Ejemplo n.º 30
0
def main():
    """Start the LogWatcher process on the module-level ``dataPath``.

    NOTE(review): the original docstring documented ``dataPath`` as a
    parameter, but this function takes none -- ``dataPath`` must be defined
    at module level or this call raises NameError. Confirm and fix upstream.
    """
    lw = LogWatcher(dataPath)
Ejemplo n.º 31
0
class Provisioner():
    def __init__(self, vm_limit, azure_config, skip_setup, local):
        """Set up provisioning state, the manager pseudo-machine and, unless
        running locally, the Azure experiment connection.

        vm_limit     -- user-supplied cap on the number of VMs
        azure_config -- Azure account/experiment settings (may be None)
        skip_setup   -- forwarded to AzureExperiment
        local        -- True to run without allocating cloud VMs
        """
        self.vm_limit = vm_limit # user input
        self.budget = 0
        self.timestamp = datetime.now()
        self.cost_pred = 0
        self.wf_end = None

        # Set when jobs finish; triggers a reschedule from update_jobs().
        self.jobs_terminated = False
        self.last_resched = None

        self.workflow = Workflow()
        self.logwatcher = LogWatcher()

        self.schedule = Schedule()

        # The condor manager is modeled as a pseudo-machine so schedule
        # entries always have a host.
        manager = Machine()
        manager.status = MachineStatus.manager
        manager.condor_slot = 'manager'
        self.machines = [manager]

        # Synthetic, already-completed "boot" entry for the manager host.
        boot_entry = ScheduleEntry(Job('boot', None), manager, self.timestamp, self.timestamp)
        boot_entry.real_start = self.timestamp
        boot_entry.real_end = self.timestamp
        boot_entry.status = EntryStatus.completed
        self.schedule.add_entry_host(boot_entry, manager)

        self.local = local
        if azure_config and not local:
            # Name the Azure experiment after this host and remember our
            # address/user so newly allocated VMs can reach the master.
            hostname = socket.gethostname()
            self.exp = AzureExperiment(azure_config, skip_setup=skip_setup, name=hostname)
            self.master_addr = socket.gethostbyname(hostname)
            self.user = azure_config.admin_username
        else:
            self.exp = self.master_addr = self.user = None
        
    def add_workflow(self, workflow_dir, prediction_file, budget):
        """Register a workflow plus its runtime predictions, adding its
        (rounded) budget to the provisioner's total and watching its logs."""
        self.budget += int(round(float(budget)))
        new_wf_id = self.workflow.add_workflow(workflow_dir, prediction_file=prediction_file)
        self.logwatcher.add(new_wf_id, workflow_dir)
            
    def update_schedule(self):
        """Rebuild the schedule for all unscheduled jobs within the current
        budget, keeping completed/running entries untouched."""
        print 'UPDATE SCHED'
        self.update_budget_timestamp()
        self.last_resched = self.timestamp

        # completed and running entries will not change
        self.schedule.rm_scheduled_entries()

        if self.workflow.has_jobs_to_sched(self.schedule):
            # Upper bound on the number of VMs worth considering.
            nmax = get_nmax(self.workflow, self.machines, self.schedule, self.vm_limit, self.timestamp, self.local)

            print 'NMAX',nmax

            # Pick the machine count (and schedule) that fits the budget.
            schedule, _cost, _n = sched_number_of_machines(self.workflow, self.machines, self.schedule, nmax, self.timestamp, self.budget, self.local)
            print "N", _n, 'budget', self.budget

            # Update schedule
            self.schedule = schedule

    def update_budget_timestamp(self):
        """Charge the elapsed VM cost against the budget and advance the
        provisioner's timestamp.

        Assumes the machine count did not change since the previous
        timestamp, and that VM_COST_PER_SEC is a per-VM cost per second.
        """
        timestamp = datetime.now()
        if self.timestamp is not None:
            # total_seconds() instead of .seconds: .seconds is only the
            # sub-day remainder of the timedelta, so any interval of 24h or
            # more would be billed wrongly.
            delta = (timestamp - self.timestamp).total_seconds()
            charged = delta * len(self.machines) * VM_COST_PER_SEC
            self.budget = self.budget - charged
        self.timestamp = timestamp
        
    def update_wf_pred(self):
        """Recompute the predicted total cost and workflow end time from the
        current schedule."""
        self.cost_pred, self.wf_end = sched_cost_pred(self.machines, self.schedule, self.timestamp)

    def allocate_new_vms(self):
        """Begin allocation of every machine whose scheduled boot entry is
        due, marking the boot entry as executing."""
        # boot entries
        if self.schedule != None:
            for m in self.schedule.entries_host.keys():
                # The first entry per host is its boot entry.
                entry = self.schedule.entries_host[m][0]
                if entry.status == EntryStatus.scheduled and entry.start() <= self.timestamp:
                    m.allocate(self.exp, self.master_addr, self.user)

                    self.machines.append(m)
                    entry.status = EntryStatus.executing
                    entry.log[LogKey.real_start] = self.timestamp
        
    
    def deallocate_vms(self):
        for m in self.machines:
            if m.status == MachineStatus.manager:
                continue
            
            # if there's no more budget or
            # if there's nothing executing or scheduled to the machine
            if self.schedule == None or len([e for e in self.schedule.entries_host[m] if e.status != EntryStatus.completed]) == 0:
                m.deallocate(self.exp)
                print "--Machine", m.condor_slot
                
        # update machine list
        self.machines = [m for m in self.machines if m.status != MachineStatus.deallocating]
    
    
    def sync_machines(self):
        """Reconcile condor's slot list with our machine records.

        Newly appeared slots are matched to machines in the 'allocating'
        state -- by private address on Azure, or in order when local -- and
        promoted to 'running', completing their boot entries.
        """
        slots_addrs = condor_slots()
        running_machines = [m for m in self.machines if m.status == MachineStatus.running]
        allocating_machines = [m for m in self.machines if m.status == MachineStatus.allocating]
        #allocating_machines.sort(key=lambda x: self.schedule.entries_host[x][0].start())
        i = 0
        for (slot,addr) in slots_addrs:
            if slot not in [m.condor_slot for m in running_machines]:
                allocated_machine = None
                if not self.local:
                    # Match the slot to an allocating VM by private address.
                    allocated_machine = next((m for m in allocating_machines if m.priv_addr == addr), None)
                elif len(allocating_machines[i:]) > 0:
                    # update machine
                    allocated_machine = allocating_machines[i]

                if allocated_machine:
                    allocated_machine.status = MachineStatus.running
                    allocated_machine.condor_slot = slot

                    # Complete the machine's boot entry.
                    boot_entry = self.schedule.entries_host[allocated_machine][0]
                    boot_entry.log[LogKey.real_end] = self.timestamp
                    boot_entry.status = EntryStatus.completed

                    i += 1
                    print "++Machine", allocated_machine.condor_slot
                else:
                    # A slot with pending work but no matching machine is
                    # unexpected -- report it.
                    if next((e for e in self.schedule.entries if e.host.priv_addr == addr and e.status != EntryStatus.completed), None):
                        print "ERROR: slot not found", slot, addr, 'nr', len(running_machines), 'na', len(allocating_machines)
                

    
    def _handle_log_events(self):
        """Apply new workflow-log events to their schedule entries.

        Returns True when at least one job terminated (a hint that a
        reschedule may be worthwhile).
        """
        jobs_terminated = False
        log_entries = self.logwatcher.nexts()

        for le in log_entries:
            # Find the entry by condor id, or match on (dag_job_id, wf_id)
            # the first time this condor id appears and index it.
            if le.id in self.schedule.entries_cid:
                sched_entry = self.schedule.entries_cid[le.id]
            else:
                sched_entry = next((e for e in self.schedule.entries if e.job.dag_job_id == le.name and e.job.wf_id == le.wf_id), None)
                if sched_entry:
                    sched_entry.condor_id = le.id
                    self.schedule.add_entry_cid(sched_entry)
            if sched_entry:
                sched_entry.log[le.event] = le.timestamp

                if le.event == LogKey.execute:
                    sched_entry.status = EntryStatus.executing

                elif le.event == LogKey.job_terminated:
                    sched_entry.status = EntryStatus.completed
                    sched_entry.log[LogKey.real_end] = self.timestamp
                    print "--Job", le.id, sched_entry.job.dag_job_id, sched_entry.host.condor_slot
                    jobs_terminated = True
            else:
                print 'could not find sched_entry for:', le.id
        return jobs_terminated
                
    def _handle_ready_jobs(self):
        """Dispatch idle condor jobs whose scheduled host is running.

        Marks matching entries as executing, pins each job to its chosen
        slot via condor_qedit, and asks condor to reschedule when anything
        was dispatched.
        """
        need_condor_resched = False
        idle_cjobs = condor_idle() # idle jobs

        for cjob in idle_cjobs:
            condor_id, wf_id, dag_job_id = cjob.split()
            # Resolve (or lazily index) the schedule entry for this job.
            if condor_id in self.schedule.entries_cid:
                sched_entry = self.schedule.entries_cid[condor_id]
            else:
                sched_entry = next((e for e in self.schedule.entries \
                                    if e.job.dag_job_id == dag_job_id \
                                    and e.job.wf_id == wf_id ), None)
                if sched_entry:
                    sched_entry.condor_id = condor_id
                    self.schedule.add_entry_cid(sched_entry)

            if sched_entry and sched_entry.status == EntryStatus.scheduled \
                    and sched_entry.host.status == MachineStatus.running:
                sched_entry.status = EntryStatus.executing
                sched_entry.log[LogKey.real_start] = self.timestamp
                print "++Job", condor_id, dag_job_id, sched_entry.host.condor_slot
                condor_qedit(condor_id, wf_id, dag_job_id, sched_entry.host.condor_slot)
                need_condor_resched = True

        if need_condor_resched:
            condor_reschedule()

    def update_jobs(self):
        """Process new log events and trigger a reschedule when jobs have
        terminated and the last reschedule is older than SCHED_TIMEOUT."""

        # handle log events and check if any job terminated
        self.jobs_terminated = self._handle_log_events() or self.jobs_terminated

        # total_seconds() rather than .seconds: .seconds is only the sub-day
        # remainder of the timedelta, so gaps of 24h or more would compare
        # incorrectly against SCHED_TIMEOUT.
        if self.last_resched and self.jobs_terminated and \
        ((self.timestamp - self.last_resched).total_seconds() > SCHED_TIMEOUT):
            self.update_schedule()
            self.jobs_terminated = False

        # handle jobs that are ready to execute
        self._handle_ready_jobs()