def get_specific_jobs(credential, jobids):
    """Look up specific job ids among the jobs running on a credential's cluster.

    Args:
        credential: Object exposing ``cluster.name`` and
            ``get_ssh_connection()``. Both are exercised up front; any
            failure marks the credential as invalid.
        jobids: Iterable of job ids to look up. May be empty or ``None``,
            in which case no job listing is fetched.

    Returns:
        A dict with the keys:
            ``"cluster"``: the cluster name, or ``None`` for an invalid
                credential.
            ``"error"``: ``None``, or ``"Invalid credential"``.
            ``"worked"``: ``(jobid, job_row)`` pairs for ids that are running.
            ``"failed"``: ``(jobid, message)`` pairs for ids that are not.
    """
    results = {
        "worked": [],
        "failed": [],
        "error": None,
    }
    try:
        results["cluster"] = credential.cluster.name
        # Called only to verify that a connection can be established.
        credential.get_ssh_connection()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not reported as a bad credential.
        results["error"] = "Invalid credential"
        results["cluster"] = None
        logger.info("Invalid credential %s", credential)
        return results

    if not jobids:
        return results

    cluster_jobs = get_jobs([credential])[0]
    # Index rows by job id (first column). ``setdefault`` keeps the first row
    # for a duplicated id, matching what ``list.index`` would have returned.
    running = {}
    for row in cluster_jobs["jobs"]:
        running.setdefault(row[0], row)

    for job in jobids:
        if job in running:
            results["worked"].append((job, running[job]))
        else:
            results["failed"].append((job, "That job number is not running."))
    return results
def get_specific_jobs(credential, jobids):
    """Report which of the requested job ids are running on the credential's cluster.

    Args:
        credential: Object exposing ``cluster.name`` and
            ``get_ssh_connection()``. Both are exercised up front; any
            failure marks the credential as invalid.
        jobids: Iterable of job ids to look up. May be empty or ``None``,
            in which case no job listing is fetched.

    Returns:
        A dict with the keys:
            ``"cluster"``: the cluster name, or ``None`` for an invalid
                credential.
            ``"error"``: ``None``, or ``"Invalid credential"``.
            ``"worked"``: ``(jobid, job_row)`` pairs for ids that are running.
            ``"failed"``: ``(jobid, message)`` pairs for ids that are not.
    """
    results = {
        "worked": [],
        "failed": [],
        "error": None,
    }
    try:
        results["cluster"] = credential.cluster.name
        # The connection object itself is never used here; the call only
        # validates that the credential can connect.
        credential.get_ssh_connection()
    except Exception:
        # Narrowed from a bare ``except`` so interpreter-exit signals
        # (KeyboardInterrupt, SystemExit) still propagate.
        results["error"] = "Invalid credential"
        results["cluster"] = None
        logger.info("Invalid credential %s", credential)
        return results

    if not jobids:
        return results

    cluster_jobs = get_jobs([credential])[0]
    # Map job id (first column of each row) to its row. The first row wins
    # for a repeated id, which is what ``list.index`` gave previously.
    rows_by_id = {}
    for row in cluster_jobs["jobs"]:
        rows_by_id.setdefault(row[0], row)

    for job in jobids:
        if job in rows_by_id:
            results["worked"].append((job, rows_by_id[job]))
        else:
            results["failed"].append((job, "That job number is not running."))
    return results
def run_case2(c):
    """Run the case-two loop until all jobs are killed, then write both reports.

    Each round calls ``c.run_once()``, pushes the elapsed whole seconds since
    the loop started into every job via ``c.update_job``, records a
    ``CaseTwoItem`` built from the running nginx pods and running trainers,
    and updates the case-one cluster utilisation. When
    ``utils.is_jobs_killed(jobs)`` becomes true the case-one report is
    finalised and both reports are written as CSV files under ``outdir``.

    Args:
        c: Collector-like object providing ``run_once``, ``update_job``,
            ``get_running_trainers`` and ``get_running_pods``.
    """
    case_one = CaseOneReport()
    case_two = CaseTwoReport()
    jobs = utils.get_jobs(JOB_NAME, JOB_COUNT)
    started_at = int(time.time())

    while True:
        c.run_once()
        elapsed = int(time.time()) - started_at
        for job in jobs:
            c.update_job(job, elapsed)

        trainers = c.get_running_trainers()
        nginx_pods = c.get_running_pods({'app': 'nginx'})
        sample = CaseTwoItem(elapsed, nginx_pods, trainers, c)
        if DETAILS:
            print_info(elapsed, c, jobs, nginx_pods)

        case_one.update_cluster_utils(c)
        case_two.append_item(sample)

        if not utils.is_jobs_killed(jobs):
            continue

        # All jobs are gone: finalise and persist both reports, then stop.
        case_one.update_jobs(jobs)
        case_one.run()
        case_two.to_csv('%s/%s-case2-result.csv' % (outdir, JOB_NAME))
        case_one.to_csv(
            '%s/%s-case1-pass%d.csv' % (outdir, JOB_NAME, PASSE_NUM))
        return
def wait_for_finished(c):
    """Poll until ``utils.is_jobs_finished`` reports that all jobs are done.

    Each round calls ``c.run_once()`` and ``c.update_job(job, 0)`` for every
    job, then either returns (all finished) or sleeps five seconds and
    retries. There is no timeout; the loop runs until the jobs finish.

    Args:
        c: Collector-like object providing ``run_once`` and ``update_job``.
    """
    jobs = utils.get_jobs(JOB_NAME, JOB_COUNT)
    while True:
        c.run_once()
        for job in jobs:
            c.update_job(job, 0)
        if utils.is_jobs_finished(jobs):
            # Single-argument print(...) behaves the same as the Python 2
            # print statement and is also valid on Python 3.
            print('All the jobs have already finished')
            return
        print('Waiting for all the jobs finsihed for 5 seconds...')
        time.sleep(5)
def get_all_jobs(credentials):
    """Return job listings for the given credentials, refreshing stale data.

    If the oldest job update for any credential is more than 60 seconds old,
    the listings are fetched through ``get_jobs``. Otherwise they are built
    from ``Job.get_running_jobs`` for each credential.

    Args:
        credentials: Iterable of credentials; each must expose
            ``cluster.name``.

    Returns:
        Either the result of ``get_jobs(credentials)``, or a list with one
        dict per credential holding ``"name"``, ``"columns"`` and ``"jobs"``.
        An empty ``credentials`` yields an empty list.
    """
    updates = [Job.get_oldest_update_time(credential=x) for x in credentials]
    # TUNABLE: change time constraint
    # ``total_seconds()`` rather than ``.seconds``: the latter drops the
    # ``days`` part of the delta, so data a day and ten seconds old would
    # have looked only ten seconds old.
    if any((timezone.now() - x).total_seconds() > 60 for x in updates):
        return get_jobs(credentials)

    jobs = []
    for cred in credentials:
        running = Job.get_running_jobs(credential=cred)
        jobs.append({
            "name": cred.cluster.name,
            "columns": WANTED_COLS,
            "jobs": [x.format() for x in running],
        })
    return jobs
def get_all_jobs(user, cluster=None):
    """Return job listings for a user's credentials, refreshing stale data.

    Args:
        user: Object whose ``credentials`` manager supports ``filter`` and
            ``all``.
        cluster: Optional cluster name; when given, only credentials whose
            cluster name matches it case-insensitively are used.

    Returns:
        Either the result of ``get_jobs(creds)`` when the user's oldest job
        update is more than 60 seconds old, or a list with one dict per
        credential holding ``"name"``, ``"columns"`` and ``"jobs"``.
    """
    if cluster:
        creds = user.credentials.filter(cluster__name__iexact=cluster)
    else:
        creds = user.credentials.all()

    # TUNABLE: change time constraint
    oldest_update = Job.get_oldest_update_time(user=user)
    # ``total_seconds()`` rather than ``.seconds``: the latter drops the
    # ``days`` part of the delta, so day-old data could be treated as fresh.
    if (timezone.now() - oldest_update).total_seconds() > 60:
        return get_jobs(creds)

    jobs = []
    for cred in creds:
        # NOTE(review): this fetches running jobs without restricting them to
        # ``cred``, so every entry may get the same rows - confirm whether a
        # per-credential filter is intended.
        running = Job.get_running_jobs()
        jobs.append({
            "name": cred.cluster.name,
            "columns": WANTED_COLS,
            "jobs": [x.format() for x in running],
        })
    return jobs
def run_case1(c):
    """Run the case-one loop until all jobs are killed, then write the report.

    Each round computes the elapsed whole seconds since the loop started,
    calls ``c.run_once()``, pushes the elapsed time into every job via
    ``c.update_job`` and updates the report's cluster utilisation. When
    ``utils.is_jobs_killed(jobs)`` becomes true the report is finalised and
    written as a CSV file under ``outdir``.

    Args:
        c: Collector-like object providing ``run_once`` and ``update_job``.
    """
    case_one = CaseOneReport()
    jobs = utils.get_jobs(JOB_NAME, JOB_COUNT)
    started_at = int(time.time())

    while True:
        elapsed = int(time.time()) - started_at
        c.run_once()
        for job in jobs:
            c.update_job(job, elapsed)

        case_one.update_cluster_utils(c)
        if DETAILS:
            print_info(elapsed, c, jobs, 0)

        if not utils.is_jobs_killed(jobs):
            continue

        # All jobs are gone: finalise and persist the report, then stop.
        case_one.update_jobs(jobs)
        case_one.run()
        case_one.to_csv(
            '%s/%s-case1-pass%d.csv' % (outdir, JOB_NAME, PASSE_NUM))
        return