Exemple #1
0
 def test_finish(self):
     """Verify that Job.finish is accepted only for a job that has been run.

     Finishing a freshly created job, or one that is already finished, is
     expected to raise StandardError; finishing a running job must set the
     FINISHED status and record a recent finish time.
     """
     record = Job.create(db.session, **self.opts)

     # A job that was never launched must be rejected
     self.assertRaises(StandardError, Job.finish, db.session, record)

     # Move the job through run -> finish and inspect the resulting state
     Job.run(db.session, record)
     Job.finish(db.session, record)
     self.assertEqual(record.status, Job.FINISHED)
     # Finish time has to fall within the last two seconds
     earliestAllowed = utils.currentTimeMillis() - 2000
     self.assertTrue(record.finishtime > earliestAllowed)

     # Finishing the same job a second time must be rejected as well
     self.assertRaises(StandardError, Job.finish, db.session, record)
Exemple #2
0
def _finishCompletedJobs(sampler, session):
    """Finish jobs whose process is no longer running and drop them from the pool."""
    # Iterate over a snapshot, because entries are removed from the pool inside the loop
    for uid, pid in list(sampler.pool.items()):
        if updateProcessStatus(pid) < 0:
            sampler.logger.info("Process '%s' is still running, job uid: '%s'", pid, uid)
            continue
        job = Job.get(session, uid)
        if not job:
            sampler.logger.warn("Job '%s' does not exist in database, updated skipped", uid)
        else:
            Job.finish(session, job)
        # Entry is removed from the pool even when the job is missing in the database
        sampler.removeFromPool(uid)


def _launchRunnableJobs(sampler, session):
    """Launch as many runnable jobs as there are free slots, if the cluster allows it."""
    # Check how many pids are left. Compare against NUM_SLOTS, nothing is launched when
    # the pool already occupies every slot.
    freeSlots = NUM_SLOTS - len(sampler.pool)
    if freeSlots <= 0:
        sampler.logger.info("All %s slots are taken, cannot launch job, skipped", NUM_SLOTS)
        sampler.logger.debug("Free slots: %s, pool size: %s, numSlots: %s", freeSlots,
            len(sampler.pool), NUM_SLOTS)
        return

    # Check status of the cluster, there is no point in launching jobs when it is down
    sparkStatus = sparkContext.clusterStatus()
    if sparkStatus == DOWN:
        sampler.logger.info("Cluster %s[%s] is down, will try again later",
            sparkContext.getMasterAddress(), sparkContext.getUiAddress())
        return

    # Number of free slots is recomputed from applications running on the cluster; if number
    # of applications is equal or more than NUM_SLOTS, launching is skipped.
    apps = sparkContext.clusterRunningApps()
    freeSlots = NUM_SLOTS - len(apps)
    if freeSlots <= 0:
        sampler.logger.info("There are %s applications running already, cannot "
            "launch job, skipped", len(apps))
        sampler.logger.debug("Free slots: %s, apps: %s, numSlots: %s", freeSlots,
            len(apps), NUM_SLOTS)
        return

    # Fetch active (runnable) jobs using Job API based on number of free slots acquired
    # earlier. Start jobs in the list, if any.
    currentTime = utils.currentTimeMillis()
    sampler.logger.debug("Fetch jobs with session %s, free slots %s, time %s",
        session, freeSlots, currentTime)
    runnableJobs = Job.listRunnable(session, freeSlots, currentTime)
    sampler.logger.info("Registering %s jobs", len(runnableJobs))
    for job in runnableJobs:
        pid = launchSparkJob(job)
        Job.run(session, job)
        sampler.addToPool(job.uid, pid)


def _scheduleNextRun(sampler):
    """Schedule next invocation of `action` on a daemon timer while sampler is enabled."""
    if sampler.enabled:
        sampler.logger.debug("Prepared to be invoked in %s seconds", sampler.interval)
        timer = Timer(sampler.interval, action, [sampler])
        timer.daemon = True
        timer.start()
    else:
        sampler.logger.info("Sampler stopped")


def action(sampler):
    """Run one sampler iteration: refresh pool state, launch runnable jobs, reschedule itself.

    Within a single session and while holding module-level `lock`, jobs whose process is no
    longer running are finished and removed from the pool, then runnable jobs are launched
    for available slots, and the session is committed. Any exception is logged and not
    propagated, so a failed iteration is skipped rather than stopping the sampler.

    :param sampler: sampler providing `logger`, `pool`, `interval` and `enabled`; when falsy
        an error is logged and function returns without scheduling another run
    :return: None
    """
    if not sampler:
        scheduler.logger.error("Sampler is undefined, exiting")
        return
    session = SignallingSession(db)
    try:
        sampler.logger.info("Start refreshing application state")
        sampler.incrementNumRuns()
        lock.acquire()
        # Lock is released only when it was actually acquired; releasing it unconditionally in
        # outer "finally" would raise if a statement before "acquire" failed, and that in turn
        # would skip closing the session and scheduling the next run.
        try:
            _finishCompletedJobs(sampler, session)
            _launchRunnableJobs(sampler, session)
            session.commit()
        finally:
            lock.release()
    except Exception as e:
        sampler.logger.error("Sampler encountered error, execution skipped")
        # NOTE(review): "e.message" exists only on Python 2 exceptions
        sampler.logger.exception(e.message)
    finally:
        # Next run is scheduled even if closing the session fails
        try:
            session.close()
        finally:
            _scheduleNextRun(sampler)