def test_flush(self):
    """Check that ``flush()`` on an open-dataset task yields one more output.

    Touches ``self.nfiles`` empty ``input_<i>.root`` files in a scratch
    directory, builds a ``CondorTask`` over them with ``open_dataset=True``
    and compares the number of outputs before and after ``flush()``.
    """
    user = os.getenv("USER")
    basedir = "/tmp/{0}/metis/condortask_testflush/".format(user)
    Utils.do_cmd("mkdir -p {0}".format(basedir))
    tag = "vflush"

    for idx in range(1, self.nfiles + 1):
        Utils.do_cmd("touch {0}/input_{1}.root".format(basedir, idx))

    sample = DirectorySample(
        location=basedir,
        globber="*.root",
        dataset="/test/test/TEST",
    )
    task = CondorTask(
        sample=sample,
        open_dataset=True,
        files_per_output=self.files_per_job,
        cmssw_version=self.cmssw,
        tag=tag,
    )

    # Before flushing: one output per complete group of files_per_job inputs.
    self.assertEqual(len(task.get_outputs()),
                     self.nfiles // self.files_per_job)

    task.flush()

    # After flushing: exactly one additional output is expected.
    self.assertEqual(len(task.get_outputs()),
                     self.nfiles // self.files_per_job + 1)
def try_to_complete(self):
    """
    Try to force the task to complete (e.g., because the
    min_completion_fraction is already satisfied).

    When the task counts as complete, the remaining condor jobs are
    removed, output files that are not explicitly done are deleted (a
    condor job may have written them in the meantime), and those outputs
    are dropped from ``self.io_mapping``.
    """
    # A min_completion_fraction of (effectively) 1 leaves nothing to force.
    if self.min_completion_fraction > 1. - 1.e-3:
        return

    # Not complete by the min_completion_fraction standard: don't even
    # bother killing tail jobs.
    if not self.complete():
        return

    for job in self.get_running_condor_jobs():
        cid = job["ClusterId"]
        Utils.condor_rm([cid])
        self.logger.info("Tail condor job {} removed".format(cid))

    stale_names = [
        unfinished.get_name()
        for unfinished in self.get_uncompleted_outputs()
    ]

    # Keep only the input/output pairs whose output is not being discarded.
    kept_mapping = [
        [ins, out]
        for ins, out in self.get_io_mapping()
        if out not in stale_names
    ]

    for name in stale_names:
        Utils.do_cmd("rm {}".format(name))
        self.logger.info("Tail root file {} removed".format(name))

    self.io_mapping = kept_mapping
def test_timestamps(self):
    """Check ``Utils.get_timestamp`` and ``Utils.from_timestamp``.

    The current timestamp must be within 2 seconds of "now", and a
    timestamp string converted with ``from_timestamp`` must format back to
    the same integer.
    """
    now = datetime.datetime.now()
    # NOTE: "%s" (seconds since the epoch) is a platform-specific strftime code.
    timestamp = int(now.strftime("%s"))

    drift = abs(Utils.get_timestamp() - timestamp)
    self.assertEqual(drift < 2, True)

    roundtrip = Utils.from_timestamp(now.strftime("%s"))
    self.assertEqual(int(roundtrip.strftime("%s")), timestamp)
def test_file_chunker(self):
    """Check ``Utils.file_chunker`` for event- and file-based chunking.

    Each case lists the keyword arguments passed to the chunker and the
    expected ``(number of chunks, size of leftover chunk)`` pair.
    """
    files = [
        EventsFile("blah1.root", nevents=100),
        EventsFile("blah2.root", nevents=200),
        EventsFile("blah3.root", nevents=300),
        EventsFile("blah4.root", nevents=100),
        EventsFile("blah5.root", nevents=200),
        EventsFile("blah6.root", nevents=300),
    ]

    cases = [
        ({"events_per_output": 300, "flush": True}, (4, 0)),
        ({"events_per_output": 300, "flush": False}, (3, 1)),
        ({"files_per_output": 4, "flush": True}, (2, 0)),
        ({"files_per_output": 4, "flush": False}, (1, 2)),
    ]

    for chunker_kwargs, expected in cases:
        chunks, leftoverchunk = Utils.file_chunker(files, **chunker_kwargs)
        self.assertEqual((len(chunks), len(leftoverchunk)), expected)
def test_gfal_copy(self):
    """Copy a small file with ``gfal-copy`` through two endpoints.

    For each (destination path, URL prefix) pair a three-line file is
    created locally and copied with ``gfal-copy``; the test then checks
    that the file shows up at the local ``/hadoop`` path. Both the remote
    and the local file are removed before the assertion.
    """
    outname = "gfaltest.root"
    griduser = os.environ.get("GRIDUSER", os.environ.get("USER"))
    basedir = "/hadoop/cms/store/user/{0}/metis_test".format(griduser)
    outfile = "{0}/{1}".format(basedir, outname)
    outfilestore = outfile.replace("/hadoop/cms", "")

    # Adjacent literals concatenate to exactly the original command string.
    copy_template = (
        " seq 1 3 > {outname}; rm -f {outfile}; "
        "env -i X509_USER_PROXY=/tmp/x509up_u`id -u` "
        "gfal-copy -p -f -t 4200 --verbose "
        "file://`pwd`/{outname} {url}{outfinal} --checksum ADLER32 "
    )
    endpoints = [
        (outfilestore, "davs://redirector.t2.ucsd.edu:1094"),
        (outfile, "gsiftp://gftp.t2.ucsd.edu"),
    ]

    for outfinal, url in endpoints:
        copy_cmd = copy_template.format(
            url=url, outname=outname, outfile=outfile, outfinal=outfinal)
        _, out = Utils.do_cmd(copy_cmd, returnStatus=True)

        exists = os.path.exists(outfile)
        if not exists:
            # Show the command output to help diagnose the failed copy.
            print("gfal-copy using {url} failed with ----------------->".
                  format(url=url))
            print(out)
            print("<---------------------------------------")

        cleanup_cmd = "rm -f {outfile} ; rm -f {outname}".format(
            outname=outname, outfile=outfile)
        Utils.do_cmd(cleanup_cmd)

        self.assertEqual(exists, True)
def write_metadata(self, d_metadata):
    """Write ``d_metadata`` as JSON and copy the backup pickle next to it.

    The JSON goes to ``<finaldir>/metadata.json`` where ``finaldir`` is
    taken from ``d_metadata["finaldir"]``; ``backup.pkl`` from the task
    directory is then copied into the same directory.
    """
    final_dir = d_metadata["finaldir"]

    with open(final_dir + "/metadata.json", "w") as fhout:
        json.dump(d_metadata, fhout, sort_keys=True, indent=4)

    copy_cmd = "cp {0}/backup.pkl {1}/".format(self.get_taskdir(), final_dir)
    Utils.do_cmd(copy_cmd)
    self.logger.info("Dumped metadata and backup pickle")
def handle_condor_job(self,
                      this_job_dict,
                      out,
                      fake=False,
                      remove_running_x_hours=48.0,
                      remove_held_x_hours=5.0):
    """
    Update the status of `out` from a condor job dictionary and remove the
    job if it has been running or held for too long.

    Parameters:
        this_job_dict: dictionary of condor job information; must contain
            "ClusterId" and "EnteredCurrentStatus". "JobStatus" defaults
            to "I" and "HoldReason" to "???" when absent.
        out: File-like object whose ``set_status`` is called.
        fake: if True, never call ``Utils.condor_rm``.
        remove_running_x_hours: a running job older than this many hours
            is removed.
        remove_held_x_hours: a held job older than this many hours is
            removed.

    Returns:
        action_type string describing what was done: "RUNNING",
        "LONG_RUNNING_REMOVED", "IDLE", "HELD", "HELD_AND_REMOVED", or
        "UNKNOWN" when the job status is none of "R", "I", "H".
    """
    cluster_id = "{}".format(this_job_dict["ClusterId"])
    job_status = this_job_dict.get("JobStatus", "I")
    # Hours elapsed since the job entered its current status.
    hours_since = abs(time.time() -
                      int(this_job_dict["EnteredCurrentStatus"])) / 3600.

    action_type = "UNKNOWN"
    # Default status; stays in place for unrecognised job states.
    out.set_status(Constants.RUNNING)

    if job_status == "R":
        self.logger.debug("Job {0} for ({1}) running for {2:.1f} hrs".format(
            cluster_id, out, hours_since))
        action_type = "RUNNING"
        out.set_status(Constants.RUNNING)

        if hours_since > remove_running_x_hours:
            # Report the configured threshold instead of a hard-coded "a day".
            self.logger.debug(
                "Job {0} for ({1}) removed for running for more than "
                "{2:.1f} hrs!".format(cluster_id, out,
                                      remove_running_x_hours))
            if not fake:
                Utils.condor_rm([cluster_id])
            action_type = "LONG_RUNNING_REMOVED"

    elif job_status == "I":
        self.logger.debug("Job {0} for ({1}) idle for {2:.1f} hrs".format(
            cluster_id, out, hours_since))
        action_type = "IDLE"
        out.set_status(Constants.IDLE)

    elif job_status == "H":
        self.logger.debug(
            "Job {0} for ({1}) held for {2:.1f} hrs with hold reason: {3}".
            format(cluster_id, out, hours_since,
                   this_job_dict.get("HoldReason", "???")))
        action_type = "HELD"
        out.set_status(Constants.HELD)

        if hours_since > remove_held_x_hours:
            self.logger.info(
                "Job {0} for ({1}) removed for excessive hold time".format(
                    cluster_id, out))
            if not fake:
                Utils.condor_rm([cluster_id])
            action_type = "HELD_AND_REMOVED"

    return action_type
def make_dashboard(self, d_web_summary):
    """Dump ``d_web_summary`` as JSON and refresh the dashboard.

    The summary is written to ``self.SUMMARY_NAME`` through
    ``Utils.locked_open``; ``Utils.update_dashboard`` is then called with
    ``self.webdir`` and that JSON file.
    """
    dump_options = dict(sort_keys=True, indent=4, separators=(',', ': '))

    with Utils.locked_open(self.SUMMARY_NAME, 'w') as fhout:
        json.dump(d_web_summary, fhout, **dump_options)

    Utils.update_dashboard(webdir=self.webdir, jsonfile=self.SUMMARY_NAME)
def test_timedelta_to_human(self):
    """Check ``Utils.timedelta_to_human`` on a few day counts.

    Each case pairs a number of days with the expected human-readable
    string.
    """
    cases = [
        (3, "3 days"),
        (3.5, "3 days"),
        (0.5, "12 hours"),
        (0.49, "11 hours"),
        (1.5, "36 hours"),
    ]
    for ndays, expected in cases:
        delta = datetime.timedelta(days=ndays)
        self.assertEqual(Utils.timedelta_to_human(delta), expected)
def merge_function(self, inputs, output):
    """
    Merge the files in `inputs` into `output` using ROOT's TFileMerger.

    Parameters:
        inputs: sized collection of file objects providing ``get_name()``
            and ``exists()``.
        output: file object providing ``get_basepath()``, ``get_name()``
            and ``get_filesizeMB()``.

    Raises:
        RuntimeError: if ``self.ignore_bad`` is false and not every input
            was added to the merger; the output is removed first.

    Note: ``self.show_progress`` is switched off (on the instance) when
    fewer than 5 inputs are given.
    """
    # make the directory hosting the output if it doesn't exist
    fdir = output.get_basepath()
    if not os.path.exists(fdir):
        Utils.do_cmd("mkdir -p {0}".format(fdir))

    # when merging 1 file, TFileMerger defaults to a special case
    # of just copying the file. this screws up because of an issue
    # in TUrl and leaves potentially big files in /tmp/ without cleaning
    # them up later, so do it nonlocally, sigh :(
    local = len(inputs) != 1
    if len(inputs) < 5:
        self.show_progress = False

    fm = r.TFileMerger(local)
    fm.OutputFile(output.get_name())
    fm.SetFastMethod(True)
    fm.SetMaxOpenedFiles(400)
    fm.SetPrintLevel(0)

    ngood = 0
    ntotal = len(inputs)
    self.logger.info("Adding {0} files to be merged".format(ntotal))

    iterable = inputs
    if self.show_progress:
        # The progress bar is best effort: fall back to the plain inputs
        # if tqdm cannot be imported or cannot wrap them. Unlike a bare
        # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        try:
            from tqdm import tqdm
            iterable = tqdm(inputs)
        except Exception:
            self.logger.debug(
                "tqdm unavailable, merging without a progress bar")

    t0 = time.time()
    for inp in iterable:
        if self.ignore_bad and not inp.exists():
            continue
        ngood += fm.AddFile(inp.get_name(), False)
        if self.show_progress:
            # Merge incrementally so the progress bar tracks real work.
            fm.PartialMerge(r.TFileMerger.kIncremental | r.TFileMerger.kAll)

    if not self.ignore_bad and (ngood != ntotal):
        MutableFile(output).rm()
        raise RuntimeError(
            "Tried to merge {0} files into {1}, but only {2} of them got included properly"
            .format(ntotal, output.get_name(), ngood))

    if not self.show_progress:
        fm.Merge()

    elapsed = time.time() - t0
    sizemb = output.get_filesizeMB()
    # Avoid ZeroDivisionError if the clock did not advance.
    rate = sizemb / elapsed if elapsed > 0 else float("inf")
    self.logger.info(
        "Done merging files into {} ({:.1f}MB). Took {:.2f} secs @ {:.1f}MB/s"
        .format(output.get_name(), sizemb, elapsed, rate))
def test_condor_submit_template_grid(self):
    """Check the submit template produced for a multi-site request.

    The template must contain the executable line, the requested site
    list, and an ``x509userproxy`` line pointing at
    ``Utils.get_proxy_file()``.
    """
    submit_kwargs = dict(
        executable="blah.sh",
        arguments=[],
        inputfiles=[],
        logdir="./",
        return_template=True,
        sites="UAF,T2_US_UCSD",
    )
    template = Utils.condor_submit(**submit_kwargs)

    for snippet in ("executable=blah.sh", "UAF,T2_US_UCSD"):
        self.assertEqual(snippet in template, True)

    proxy_line = "x509userproxy={0}".format(Utils.get_proxy_file())
    self.assertEqual(proxy_line in template, True)
def test_make_tar(self):
    """Add a text file to a ``UserTarball`` and read it back with ``tar``.

    A file containing "check" is added to the tarball; extracting the
    tarball to stdout must print exactly that content.
    """
    basedir = "/tmp/{0}/metis/tar_test/".format(os.getenv("USER"))
    tarname = "{0}/test.tar.gz".format(basedir)
    textname = "{0}/test.txt".format(basedir)
    Utils.do_cmd("mkdir -p {0}".format(basedir))

    tarball = UserTarball.UserTarball(name=tarname)
    Utils.do_cmd("echo check > {0}".format(textname))
    tarball.tarfile.add(textname)
    tarball.close()

    extracted = Utils.do_cmd("tar xzOf {0}".format(tarname))
    self.assertEqual(extracted.strip(), "check")
def prepare_inputs(self):
    """Copy the executable (and the tarball, if any) into the task directory.

    Sets ``self.executable_path`` and ``self.package_path``, copies
    ``self.input_executable`` and, when ``self.tarfile`` is set, the
    tarball to those paths, and finally sets ``self.prepared_inputs``.
    """
    self.executable_path = "{0}/executable.sh".format(self.get_taskdir())
    self.package_path = "{0}/package.tar.gz".format(self.get_taskdir())

    # (source, destination) pairs; the package is copied only if we were
    # given a tarfile.
    copies = [(self.input_executable, self.executable_path)]
    if self.tarfile:
        copies.append((self.tarfile, self.package_path))

    for src, dst in copies:
        Utils.do_cmd("cp {0} {1}".format(src, dst))

    self.prepared_inputs = True
def __init__(self, data=None, summary_fname="../summary.json"):
    """Store the summary data, loading it from disk when none is given.

    Parameters:
        data: summary data; when omitted or empty it is loaded as JSON
            from `summary_fname`.
        summary_fname: path of the JSON summary file.
    """
    # ``None`` replaces the former mutable ``{}`` default, which would be
    # shared between all calls.
    self.data = {} if data is None else data
    self.summary_fname = summary_fname
    if not self.data:
        with Utils.locked_open(self.summary_fname, "r") as fhin:
            self.data = json.load(fhin)
def __init__(self,
             data=None,
             summary_fname="summary.json",
             webdir="~/public_html/dump/metis_test/",
             do_history=True,
             make_plots=False):
    """Store the dashboard settings, loading the summary when none is given.

    Parameters:
        data: summary data; when omitted or empty it is loaded as JSON
            from `summary_fname`.
        summary_fname: path of the JSON summary file.
        webdir: stored as ``self.webdir``.
        do_history: stored as ``self.do_history``.
        make_plots: stored as ``self.make_plots``.
    """
    # ``None`` replaces the former mutable ``{}`` default, which would be
    # shared between all calls.
    self.data = {} if data is None else data
    self.summary_fname = summary_fname
    self.webdir = webdir
    self.SUMMARY_NAME = "web_summary.json"
    self.do_history = do_history
    self.logger = logging.getLogger(Utils.setup_logger())
    self.make_plots = make_plots
    if not self.data:
        with Utils.locked_open(self.summary_fname, "r") as fhin:
            self.data = json.load(fhin)
def test_condor_submission_output_local(self):
    """
    This test actually submits a condor job to the local universe and
    checks that the job created its output file.

    To deal with delays, the output is polled with increasing sleeps
    (up to 24s in total), so skip this if end-to-end condor testing
    isn't needed.
    """
    basedir = "/tmp/{0}/metis/condor_test/".format(os.getenv("USER"))
    Utils.do_cmd("mkdir -p {0}".format(basedir))

    test_file = "{0}/super_secret_file_for_test.txt".format(basedir)
    Utils.do_cmd("rm {0}".format(test_file))

    # One command per line: the shebang must be alone on the first line
    # for the script to be executable.
    script = "\n".join([
        "#!/usr/bin/env bash",
        'echo "Metis"',
        "touch {0}",
        "",
    ]).format(test_file)
    with open("{0}/temp_test_local.sh".format(basedir), "w") as fhout:
        fhout.write(script)
    Utils.do_cmd("chmod a+x {0}/temp_test_local.sh".format(basedir))

    success, cluster_id = Utils.condor_submit(
        executable=basedir + "temp_test_local.sh",
        arguments=[],
        inputfiles=[],
        logdir=basedir,
        universe="local")

    # Poll for the file the job is supposed to touch.
    found_it = False
    for t in [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 5.0, 10.0]:
        time.sleep(t)
        if os.path.exists(test_file):
            found_it = True
            break

    self.assertEqual(found_it, True)
def test_condor_submission_output_local_multiple(self):
    """
    Same as `test_condor_submission_output_local` but for multiple jobs
    within a single submit file/cluster_id.

    The output is polled with increasing sleeps (up to 24s in total).
    """
    basedir = "/tmp/{0}/metis/condor_test_multiple/".format(
        os.getenv("USER"))
    Utils.do_cmd("mkdir -p {0}".format(basedir))

    test_file = "{0}/super_secret_file_for_test.txt".format(basedir)
    Utils.do_cmd("rm {0}".format(test_file))

    # One command per line: the shebang must be alone on the first line
    # for the script to be executable.
    script = "\n".join([
        "#!/usr/bin/env bash",
        'echo "Metis"',
        "touch {0}",
        "",
    ]).format(test_file)
    with open("{0}/temp_test_local.sh".format(basedir), "w") as fhout:
        fhout.write(script)
    Utils.do_cmd("chmod a+x {0}/temp_test_local.sh".format(basedir))

    success, cluster_id = Utils.condor_submit(
        executable=basedir + "temp_test_local.sh",
        arguments=[[1, 2], [3, 4]],
        inputfiles=[],
        logdir=basedir,
        universe="local",
        multiple=True)

    # Poll for the file the jobs are supposed to touch.
    found_it = False
    for t in [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 5.0, 10.0]:
        time.sleep(t)
        if os.path.exists(test_file):
            found_it = True
            break

    self.assertEqual(found_it, True)
def test_statistics(self):
    """Check the summary dictionary ``Utils.get_stats`` returns for [1, 2, 3]."""
    expected = {
        'length': 3,
        'totsum': 6,
        'minimum': 1,
        'maximum': 3,
        'mean': 2.0,
        'sigma': 1.0,
    }
    stats = Utils.get_stats([1, 2, 3])
    self.assertEqual(stats, expected)
def test_condor_submit_template_uaf(self):
    """Check the submit template produced when only the UAF site is requested."""
    submit_kwargs = dict(
        executable="blah.sh",
        arguments=[],
        inputfiles=[],
        logdir="./",
        return_template=True,
        sites="UAF",
    )
    template = Utils.condor_submit(**submit_kwargs)

    for snippet in ("executable=blah.sh", "UAF"):
        self.assertEqual(snippet in template, True)
def test_condor_submit_fake(self):
    """Check that a fake submission reports success with cluster id -1."""
    # A stray, no-op ``self.assertEqual`` expression statement was removed here.
    success, cluster_id = Utils.condor_submit(
        executable="blah.sh",
        arguments=[],
        inputfiles=[],
        logdir="./",
        fake=True,
    )
    self.assertEqual(success, True)
    self.assertEqual(cluster_id, -1)
def submit_multiple_condor_jobs(self, v_ins, v_out, fake=False, optimizer=None):
    """Submit one condor job per (inputs, output) pair in a single call.

    Parameters:
        v_ins: list of input-file lists; each file provides ``get_name()``.
        v_out: list of output files; each provides ``get_index()``.
        fake: forwarded to ``Utils.condor_submit``.
        optimizer: optional object whose ``get_sites(self, v_ins, v_out)``
            supplies one ``DESIRED_Sites`` value per job.

    Returns:
        whatever ``Utils.condor_submit`` returns.
    """
    outdir = self.output_dir
    outname_noext = self.output_name.rsplit(".", 1)[0]
    v_inputs_commasep = [
        ",".join(infile.get_name() for infile in ins) for ins in v_ins
    ]
    v_index = [out.get_index() for out in v_out]
    cmssw_ver, scramarch = self.cmssw_version, self.scram_arch
    executable = self.executable_path

    # One argument list per job.
    v_arguments = []
    for index, inputs_commasep in zip(v_index, v_inputs_commasep):
        v_arguments.append([
            outdir, outname_noext, inputs_commasep, index, cmssw_ver,
            scramarch, self.arguments
        ])

    def common_pairs(index):
        # Classad pairs attached to every job, with or without an optimizer.
        return [
            ["taskname", self.unique_name],
            ["jobnum", index],
            ["tag", self.tag],
            [
                "metis_retries",
                len(self.job_submission_history.get(index, []))
            ],
        ]

    if optimizer:
        v_sites = optimizer.get_sites(self, v_ins, v_out)
        v_selection_pairs = [
            common_pairs(index) + [["DESIRED_Sites", sites]]
            for index, sites in zip(v_index, v_sites)
        ]
    else:
        v_selection_pairs = [common_pairs(index) for index in v_index]

    logdir_full = os.path.abspath("{0}/logs/".format(self.get_taskdir()))
    package_full = os.path.abspath(self.package_path)

    # The package is shipped only when a tarfile was provided.
    input_files = [package_full] if self.tarfile else []
    input_files += self.additional_input_files

    extra = self.kwargs.get("condor_submit_params", {})
    return Utils.condor_submit(executable=executable,
                               arguments=v_arguments,
                               inputfiles=input_files,
                               logdir=logdir_full,
                               selection_pairs=v_selection_pairs,
                               multiple=True,
                               fake=fake,
                               **extra)
def test_everything(self):
    """Push a dummy sample to DIS and read it back.

    A read-only ``SNTSample`` must refuse to update DIS; after clearing
    ``read_only`` the update must succeed, and a fresh sample with the
    same dataset and tag must report the stored files, global tag, event
    count and location.
    """
    nfiles = 5
    tag = "v1"
    dsname = "/DummyDataset/Dummy/TEST"
    basedir = "/tmp/{0}/metis/sntsample_test/".format(os.getenv("USER"))

    # make a directory, touch <nfiles> files
    Utils.do_cmd("mkdir -p {0} ; rm {0}/*.root".format(basedir))
    for idx in range(1, nfiles + 1):
        Utils.do_cmd("touch {0}/output_{1}.root".format(basedir, idx))

    # push a dummy dataset to DIS using the dummy location
    # and make sure we updated the sample without problems
    dummy = SNTSample(
        dataset=dsname,
        tag=tag,
        read_only=True,  # note that this is the default!
    )
    dummy.info["location"] = basedir
    dummy.info["nevents"] = 123
    dummy.info["gtag"] = "stupidtag"

    # will fail the first time, since it's read only
    self.assertEqual(dummy.do_update_dis(), False)

    # flip the bool and updating should succeed
    dummy.read_only = False
    self.assertEqual(dummy.do_update_dis(), True)

    # make a new sample, retrieve from DIS, and check
    # that the location was written properly
    check = SNTSample(dataset=dsname, tag=tag)
    self.assertEqual(len(check.get_files()), nfiles)
    self.assertEqual(check.get_globaltag(), dummy.info["gtag"])
    self.assertEqual(check.get_nevents(), dummy.info["nevents"])
    self.assertEqual(check.get_location(), basedir)
def test_gfal_copy(self):
    """Copy an empty file with ``gfal-copy`` over gsiftp.

    The file is created locally, copied with ``gfal-copy``, and the test
    checks that it shows up at the ``/hadoop`` path. Both the remote and
    the local file are removed before the assertion.
    """
    outname = "gfaltest.root"
    griduser = os.environ.get("GRIDUSER", os.environ.get("USER"))
    basedir = "/hadoop/cms/store/user/{0}/metis_test".format(griduser)
    outfile = "{0}/{1}".format(basedir, outname)

    # Adjacent literals concatenate to exactly the original command string.
    copy_template = (
        " touch {outname}; rm -f {outfile}; "
        "env -i X509_USER_PROXY=/tmp/x509up_u`id -u` "
        "gfal-copy -p -f -t 4200 --verbose "
        "file://`pwd`/{outname} gsiftp://gftp.t2.ucsd.edu{outfile} "
        "--checksum ADLER32 "
    )
    copy_cmd = copy_template.format(outname=outname, outfile=outfile)
    _, out = Utils.do_cmd(copy_cmd, returnStatus=True)

    exists = os.path.exists(outfile)
    if not exists:
        # Show the command output to help diagnose the failed copy.
        print("gfal-copy failed with ----------------->")
        print(out)
        print("<---------------------------------------")

    cleanup_cmd = "rm -f {outfile} ; rm -f {outname}".format(
        outname=outname, outfile=outfile)
    Utils.do_cmd(cleanup_cmd)

    self.assertEqual(exists, True)
def setUpClass(cls):
    """Build the shared dummy ``CondorTask`` and run it twice in fake mode.

    Touches ``cls.nfiles`` input files and an executable in a scratch
    directory, creates ``cls.dummy`` over them, prepares its inputs and
    calls ``run(fake=True)`` twice.
    """
    super(CondorTaskTest, cls).setUpClass()

    # make a test directory and touch some root files and executable there
    basedir = "/tmp/{0}/metis/condortask_test/".format(os.getenv("USER"))
    Utils.do_cmd("mkdir -p {0}".format(basedir))
    for idx in range(1, cls.nfiles + 1):
        Utils.do_cmd("touch {0}/input_{1}.root".format(basedir, idx))
    Utils.do_cmd("echo hello > {0}/executable.sh".format(basedir))

    # make dummy CondorTask with the files we touched in the basedir,
    # and chunk the outputs
    logging.getLogger("logger_metis").disabled = True
    sample = DirectorySample(
        location=basedir,
        globber="*.root",
        dataset="/test/test/TEST",
    )
    cls.dummy = CondorTask(
        sample=sample,
        open_dataset=False,
        files_per_output=cls.files_per_job,
        cmssw_version=cls.cmssw,
        tag=cls.tag,
        executable="{0}/executable.sh".format(basedir),
    )

    # prepare inputs and run, but pretend like outputs exist and don't submit
    cls.dummy.prepare_inputs()

    # first run: "submit to condor" and "create outputs" (set_fake);
    # second run: recognize that all outputs are there so that
    # completion can be declared
    for _ in range(2):
        cls.dummy.run(fake=True)
def get_running_condor_jobs(self, extra_columns=None):
    """
    Get list of dictionaries for condor jobs satisfying the
    classad given by the unique_name, requesting an extra
    column for the second classad that we submitted the job
    with (the job number).

    I.e., each task has the same taskname and each job
    within a task has a unique job num corresponding to the
    output file index.

    Parameters:
        extra_columns: optional iterable of additional column names to
            request on top of "jobnum".

    Returns:
        the result of ``Utils.condor_q`` for this task's ``taskname``.
    """
    # ``None`` replaces the former mutable ``[]`` default; any iterable
    # (list, tuple, ...) is accepted.
    columns = ["jobnum"] + list(extra_columns or [])
    return Utils.condor_q(selection_pairs=[["taskname", self.unique_name]],
                          extra_columns=columns,
                          use_python_bindings=True)
def test_singularity_container_switches(self):
    """Check that the container setting toggles the Singularity classad.

    With ``container=None`` the template must not mention a Singularity
    container or image; with a container path it must contain the
    ``+SingularityImage`` classad and the path itself.
    """
    common = dict(
        executable="blah.sh",
        arguments=[],
        inputfiles=[],
        logdir="./",
        return_template=True,
        sites="UAF,T2_US_UCSD",
    )

    template = Utils.condor_submit(container=None, **common)
    self.assertEqual("+SingularityContainer" in template, False)
    # Also check the classad the positive case below looks for; otherwise
    # this negative check could never fail for it.
    self.assertEqual("+SingularityImage" in template, False)

    container = "/cvmfs/singularity.opensciencegrid.org/bbockelm/cms:rhel7"
    template = Utils.condor_submit(container=container, **common)
    self.assertEqual("+SingularityImage" in template, True)
    self.assertEqual(container in template, True)
def test_condor_submission_and_status(self):
    """Submit a real condor job and check that ``condor_q`` finds it.

    A test script is written and submitted with two custom classads; the
    queue is then queried with the same classads. The job is removed
    before the assertions so that a failure does not leave it behind.
    """
    basedir = "/tmp/{0}/metis/condor_test/".format(os.getenv("USER"))
    Utils.do_cmd("mkdir -p {0}".format(basedir))

    # One command per line: the shebang must be alone on the first line
    # and the shell comments must not swallow the commands after them.
    script_lines = [
        '#!/usr/bin/env bash',
        'echo "--- begin header output ---"',
        'echo "hostname: $(hostname)"',
        'echo "uname -a: $(uname -a)"',
        'echo "time: $(date +%s)"',
        'echo "args: $@"',
        'echo "ls -l output"',
        'ls -l',
        '# logging every 45 seconds gives ~100kb log file/3 hours',
        'dstat -cdngytlmrs --float --nocolor -T --output dsout.csv 45 >& /dev/null &',
        'echo "--- end header output ---"',
        '# run main job stuff',
        'sleep 60s',
        'echo "--- begin dstat output ---"',
        'cat dsout.csv',
        'echo "--- end dstat output ---"',
        'kill %1 # kill dstat',
        'echo "ls -l output"',
        'ls -l',
        '',
    ]
    with open("{0}/temp_test.sh".format(basedir), "w") as fhout:
        fhout.write("\n".join(script_lines))
    Utils.do_cmd("chmod a+x {0}/temp_test.sh".format(basedir))

    selection_pairs = [["MyVar1", "METIS_TEST"], ["MyVar2", "METIS_TEST2"]]
    success, cluster_id = Utils.condor_submit(
        executable=basedir + "temp_test.sh",
        arguments=["cat", 10, "foo"],
        inputfiles=[],
        logdir=basedir,
        selection_pairs=[list(pair) for pair in selection_pairs])

    jobs = Utils.condor_q(selection_pairs=selection_pairs)
    found_job = len(jobs) >= 1

    # Clean up the job before asserting.
    Utils.condor_rm([cluster_id])

    self.assertEqual(success, True)
    self.assertEqual(found_job, True)
def test_condor_submit_template_multiple(self):
    """Check that a three-job submission yields three arguments/queue entries."""
    job_arguments = [[1, 2], [3, 4], [5, 6]]
    job_classads = [
        [["jobnum", "1"], ["taskname", "test"]],
        [["jobnum", "2"], ["taskname", "test"]],
        [["jobnum", "3"], ["taskname", "test"]],
    ]

    template = Utils.condor_submit(
        executable="blah.sh",
        inputfiles=[],
        arguments=job_arguments,
        selection_pairs=job_classads,
        logdir="./",
        return_template=True,
        sites="UAF,T2_US_UCSD",
        multiple=True,
    )

    for keyword in ("arguments", "queue"):
        self.assertEqual(template.count(keyword), 3)
def test_full(self):
    """
    Touch a root file ("input")
    Submit condor jobs to touch output files for each input file
    and copy them to hadoop
    Jobs get submitted to local universe for speed reasons
    Check output to make sure job completed
    """
    njobs = 2
    cmssw = "CMSSW_8_0_21"
    basedir = "/tmp/{0}/metis/condortask_testfull/".format(
        os.getenv("USER"))
    Utils.do_cmd("mkdir -p {0}".format(basedir))
    tag = "vfull"
    for i in range(1, njobs + 1):
        Utils.do_cmd("touch {0}/input_{1}.root".format(basedir, i))

    logging.getLogger("logger_metis").disabled = True
    dummy = CondorTask(
        sample=DirectorySample(
            location=basedir,
            globber="*.root",
            dataset="/test/test/TEST",
        ),
        open_dataset=False,
        files_per_output=1,
        cmssw_version=cmssw,
        executable=Utils.metis_base() +
        "metis/executables/condor_test_exe.sh",
        tag=tag,
        condor_submit_params={"universe": "local"},
        no_load_from_backup=True,
    )

    # clean up previous directory
    Utils.do_cmd("rm -rf {0}".format(dummy.get_outputdir()))

    # Process the task repeatedly with increasing waits until it
    # reports completion.
    is_complete = False
    for t in [1.0, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0]:
        dummy.process()
        time.sleep(t)
        is_complete = dummy.complete()
        if is_complete:
            break

    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(is_complete, True)
    self.assertEqual(njobs, len(glob.glob(dummy.get_outputdir() + "/*")))
def test_recheck_real(self):
    """Check that ``File.exists()`` is cached until ``recheck()`` is called."""
    # make a test file
    basedir = "/tmp/{0}/metis/file_test/".format(os.getenv("USER"))
    fname = "{0}/test.txt".format(basedir)
    for shell_cmd in ("mkdir -p {0}".format(basedir),
                      "touch {0}".format(fname)):
        Utils.do_cmd(shell_cmd)

    test_file = File(fname)

    # it exists
    self.assertEqual(test_file.exists(), True)

    # delete it; it still "exists" due to caching (to avoid
    # unnecessary `ls`)
    Utils.do_cmd("rm {0}".format(fname))
    self.assertEqual(test_file.exists(), True)

    # force recheck/recache; now it doesn't exist
    test_file.recheck()
    self.assertEqual(test_file.exists(), False)