def try_to_complete(self):
    """
    Force the task over the finish line once it is "complete enough".

    When the task already counts as complete (``self.complete()`` is true)
    while ``min_completion_fraction`` is below 1, the tail is cleaned up:
    every job returned by ``get_running_condor_jobs()`` is removed from
    condor, the file of every uncompleted output is deleted (a straggling
    condor job may have written it in the meantime), and those outputs are
    dropped from ``self.io_mapping``.

    Nothing happens when ``min_completion_fraction`` is effectively 1, or
    when the task is not yet complete by the ``min_completion_fraction``
    standard.
    """
    # With a fraction of (essentially) 1 there is no tail to trim, and while
    # the task is still incomplete the tail jobs are left alone.
    if self.min_completion_fraction > 1. - 1.e-3 or not self.complete():
        return

    for condor_job in self.get_running_condor_jobs():
        cluster_id = condor_job["ClusterId"]
        Utils.condor_rm([cluster_id])
        self.logger.info("Tail condor job {} removed".format(cluster_id))

    stale_names = [
        unfinished.get_name() for unfinished in self.get_uncompleted_outputs()
    ]

    # Keep only the mapping entries whose output is not being discarded.
    # NOTE(review): this compares the mapping's output objects against file
    # names, so it presumably relies on the output type's equality handling
    # plain names -- confirm.
    surviving_mapping = [
        [ins, out]
        for ins, out in self.get_io_mapping()
        if out not in stale_names
    ]

    for stale_name in stale_names:
        Utils.do_cmd("rm {}".format(stale_name))
        self.logger.info("Tail root file {} removed".format(stale_name))

    self.io_mapping = surviving_mapping
def handle_condor_job(self, this_job_dict, out, fake=False, remove_running_x_hours=48.0, remove_held_x_hours=5.0):
    """
    Update the status of `out` from its condor job and remove stuck jobs.

    :param this_job_dict: dictionary of condor job information. Must contain
        "ClusterId" and "EnteredCurrentStatus" (a timestamp in seconds that
        is compared against `time.time()`); "JobStatus" is read as "R"
        (running), "I" (idle) or "H" (held) and defaults to "I" when absent;
        "HoldReason" is only used for logging.
    :param out: File object whose status is updated through `set_status`
    :param fake: if True, `Utils.condor_rm` is never called; the returned
        action type is the same as if the job had been removed
    :param remove_running_x_hours: a running job is removed once it has been
        in its current state for more than this many hours
    :param remove_held_x_hours: a held job is removed once it has been in
        its current state for more than this many hours
    :returns: action_type, one of "RUNNING", "LONG_RUNNING_REMOVED", "IDLE",
        "HELD", "HELD_AND_REMOVED" or "UNKNOWN" (unrecognized JobStatus)
    """
    cluster_id = "{}".format(this_job_dict["ClusterId"])
    job_status = this_job_dict.get("JobStatus", "I")
    hours_since = abs(time.time() - int(this_job_dict["EnteredCurrentStatus"])) / 3600.

    # Default outcome: an unrecognized JobStatus leaves `out` marked as
    # running and is reported as "UNKNOWN".
    action_type = "UNKNOWN"
    out.set_status(Constants.RUNNING)

    if job_status == "R":
        self.logger.debug(
            "Job {0} for ({1}) running for {2:.1f} hrs".format(
                cluster_id, out, hours_since))
        action_type = "RUNNING"
        if hours_since > remove_running_x_hours:
            # Report the threshold actually in effect (it is configurable),
            # at the same level as the held-job removal below.
            self.logger.info(
                "Job {0} for ({1}) removed for running for more than {2:.1f} hrs"
                .format(cluster_id, out, remove_running_x_hours))
            if not fake:
                Utils.condor_rm([cluster_id])
            action_type = "LONG_RUNNING_REMOVED"
    elif job_status == "I":
        self.logger.debug("Job {0} for ({1}) idle for {2:.1f} hrs".format(
            cluster_id, out, hours_since))
        action_type = "IDLE"
        out.set_status(Constants.IDLE)
    elif job_status == "H":
        self.logger.debug(
            "Job {0} for ({1}) held for {2:.1f} hrs with hold reason: {3}".
            format(cluster_id, out, hours_since,
                   this_job_dict.get("HoldReason", "???")))
        action_type = "HELD"
        out.set_status(Constants.HELD)
        if hours_since > remove_held_x_hours:
            self.logger.info(
                "Job {0} for ({1}) removed for excessive hold time".format(
                    cluster_id, out))
            if not fake:
                Utils.condor_rm([cluster_id])
            action_type = "HELD_AND_REMOVED"

    return action_type
def test_condor_submission_and_status(self):
    """
    Submit a small test job to condor, look it up in the queue, remove it.

    A throwaway bash script is written under /tmp/$USER/metis/condor_test/,
    submitted with two identifying selection pairs, and then searched for
    with `Utils.condor_q` using the same pairs. The job is removed before
    the assertions run, so a failing assertion does not leave it queued.
    """
    selection_pairs = [["MyVar1", "METIS_TEST"], ["MyVar2", "METIS_TEST2"]]
    basedir = "/tmp/{0}/metis/condor_test/".format(os.getenv("USER"))
    script_path = "{0}/temp_test.sh".format(basedir)
    script_body = """#!/usr/bin/env bash
echo "--- begin header output ---"
echo "hostname: $(hostname)"
echo "uname -a: $(uname -a)"
echo "time: $(date +%s)"
echo "args: $@"
echo "ls -l output"
ls -l
# logging every 45 seconds gives ~100kb log file/3 hours
dstat -cdngytlmrs --float --nocolor -T --output dsout.csv 45 >& /dev/null &
echo "--- end header output ---"
# run main job stuff
sleep 60s
echo "--- begin dstat output ---"
cat dsout.csv
echo "--- end dstat output ---"
kill %1 # kill dstat
echo "ls -l output"
ls -l
"""

    # Lay down the executable that the condor job will run.
    Utils.do_cmd("mkdir -p {0}".format(basedir))
    with open(script_path, "w") as fhout:
        fhout.write(script_body)
    Utils.do_cmd("chmod a+x {0}".format(script_path))

    success, cluster_id = Utils.condor_submit(
        executable=basedir + "temp_test.sh",
        arguments=["cat", 10, "foo"],
        inputfiles=[],
        logdir=basedir,
        selection_pairs=selection_pairs)

    # The job counts as found if the queue query returns at least one match.
    matching_jobs = Utils.condor_q(selection_pairs=selection_pairs)
    found_job = len(matching_jobs) >= 1

    Utils.condor_rm([cluster_id])

    self.assertEqual(success, True)
    self.assertEqual(found_job, True)