def the_job(test_dir, resources):
    job_script = format_script(
        "#!/usr/bin/python3\n"
        + textwrap.dedent("""
            import os
            import sys
            import time

            elapsed = 0
            while elapsed < int(sys.argv[1]):"""
            + "".join(
                f"""
                os.system('condor_status -ads ${{_CONDOR_SCRATCH_DIR}}/.update.ad -af Assigned{resource}s {resource}sMemoryUsage')"""
                for resource in resources
            )
            + """
                time.sleep(1)
                elapsed += 1
            """
        )
    )

    script_file = test_dir / "poll-memory.py"
    write_file(script_file, job_script)

    job_spec = {
        "executable": script_file.as_posix(),
        "arguments": "17",
        "log": (test_dir / "events.log").as_posix(),
        "output": (test_dir / "poll-memory.$(Cluster).$(Process).out").as_posix(),
        "error": (test_dir / "poll-memory.$(Cluster).$(Process).err").as_posix(),
        "getenv": "true",
        "LeaveJobInQueue": "true",
    }
    for resource in resources:
        job_spec[f"request_{resource}s"] = "2"

    return job_spec
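# For illustration: with resources=["SQUID"], the template above renders a
# poll script along these lines (hypothetical rendering, shown only to make
# the string concatenation easier to follow):
#
#   #!/usr/bin/python3
#
#   import os
#   import sys
#   import time
#
#   elapsed = 0
#   while elapsed < int(sys.argv[1]):
#       os.system('condor_status -ads ${_CONDOR_SCRATCH_DIR}/.update.ad -af AssignedSQUIDs SQUIDsMemoryUsage')
#       time.sleep(1)
#       elapsed += 1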
def condor(test_dir, slot_config):
    for resource in resources.keys():
        sequence = {f"{resource}{i}": j for i, j in enumerate(usages[resource])}
        discovery_script = format_script(discovery_script_for(resource, sequence))
        write_file(test_dir / f"{resource}-discovery.py", discovery_script)

        sequences = {f"{resource}{i}": j for i, j in enumerate(peaks[resource])}
        monitor_script = both_monitor_script(resource, sequence, sequences)
        write_file(test_dir / f"{resource}-monitor.py", monitor_script)

    with Condor(
        local_dir=test_dir / "condor",
        config={**slot_config, "TEST_DIR": test_dir.as_posix()},
    ) as condor:
        # Ornithology will run condor_who to verify that all the daemons are
        # running, but occasionally, not all slots will have made it to the
        # collector yet, so poll until they do (giving up after 20 tries).
        num_slots = int(slot_config["NUM_SLOTS"])
        loop_count = 0
        while num_slots != len(
            condor.status(ad_type=htcondor.AdTypes.Startd, projection=["SlotID"])
        ):
            loop_count += 1
            assert loop_count < 20
            time.sleep(1)

        yield condor
def path_to_exit_one(test_dir):
    exit_one_script = '''#!/bin/bash
echo "exit_one = 1"
exit 1
'''
    path = test_dir / "exit_one"
    write_file(path, exit_one_script)
    return path
def path_to_sig_kill(test_dir):
    sig_kill_script = '''#!/bin/bash
echo "sig_kill = 1"
kill -KILL $$
'''
    path = test_dir / "sig_kill"
    write_file(path, sig_kill_script)
    return path
def job_shell_file(test_dir, job_python_file, plugin_log_file):
    job_shell_file = test_dir / "debug.sh"
    contents = format_script(f"""
        #!/bin/bash
        exec {job_python_file} $@ &> {plugin_log_file}
    """)
    write_file(job_shell_file, contents)
    return job_shell_file
def path_to_exit_zero(test_dir):
    exit_zero_script = '''#!/bin/bash
echo "exit_zero = 0"
exit 0
'''
    path = test_dir / "exit_zero"
    write_file(path, exit_zero_script)
    return path
def path_to_ordering_script(test_dir):
    script = """
    #!/bin/bash
    cat a_file
    exit 0
    """
    path = test_dir / "ordering.sh"
    write_file(path, format_script(script))
    return path
def condor(test_dir, slot_config, discovery_script, monitor_script):
    write_file(test_dir / "discovery.py", discovery_script)
    write_file(test_dir / "monitor.py", monitor_script)

    with Condor(
        local_dir=test_dir / "condor",
        config={**slot_config, "TEST_DIR": test_dir.as_posix()},
    ) as condor:
        # Try to make sure the monitor runs before we continue with the test.
        time.sleep(MONITOR_PERIOD * 1.5)
        yield condor
def condor(test_dir, slot_config, discovery_script, monitor_script):
    write_file(test_dir / "discovery", discovery_script)
    write_file(test_dir / "monitor", monitor_script)

    with Condor(
        local_dir=test_dir / "condor",
        config={**slot_config, "TEST_DIR": test_dir.as_posix()},
    ) as condor:
        yield condor
def condor(test_dir, slot_config):
    for resource in resources.keys():
        sequence = {f"{resource}{i}": j for i, j in enumerate(usages[resource])}
        discovery_script = format_script(discovery_script_for(resource, sequence))
        write_file(test_dir / f"{resource}-discovery.py", discovery_script)

        sequences = {f"{resource}{i}": j for i, j in enumerate(peaks[resource])}
        monitor_script = both_monitor_script(resource, sequence, sequences)
        write_file(test_dir / f"{resource}-monitor.py", monitor_script)

    with Condor(
        local_dir=test_dir / "condor",
        config={**slot_config, "TEST_DIR": test_dir.as_posix()},
    ) as condor:
        yield condor
def jobids_for_sleep_jobs(test_dir, condor, max_idle, max_materialize):
    sub_description = """
        executable = /bin/sleep
        arguments = 10

        request_memory = 1MB
        request_disk = 1MB

        max_materialize = {max_materialize}
        max_idle = {max_idle}

        queue {q}
    """.format(
        max_materialize=max_materialize,
        max_idle=max_idle,
        q=max_materialize + max_idle + 1,
    )
    submit_file = write_file(test_dir / "queue.sub", sub_description)

    submit_cmd = condor.run_command(["condor_submit", submit_file])
    clusterid, num_procs = parse_submit_result(submit_cmd)

    jobids = [JobID(clusterid, n) for n in range(num_procs)]

    condor.job_queue.wait_for_events(
        {jobid: [SetJobStatus(JobStatus.COMPLETED)] for jobid in jobids},
        timeout=60,
    )

    return jobids
def submit_dataflow_skip_job_cmd(
    test_dir,
    default_condor,
    finished_output_jobid,
    path_to_sleep,
    dataflow_input_file,
    dataflow_output_file,
):
    """
    After submit_output_job_cmd() has completed, submit a new job with an
    argument telling HTCondor to skip it if it's a dataflow job.
    """
    sub_description = """
        executable = {exe}
        arguments = 10
        transfer_input_files = {input}
        transfer_output_files = {output}
        should_transfer_files = YES
        skip_if_dataflow = True
        log = {log}

        queue
    """.format(
        exe=path_to_sleep,
        input=dataflow_input_file,
        output=dataflow_output_file,
        log=test_dir / "submit" / "dataflow-skip.log",
    )
    submit_file = write_file(test_dir / "submit" / "dataflow-skip.sub", sub_description)

    return default_condor.run_command(["condor_submit", submit_file])
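# For reference (a hedged summary of the submit knob used above): with
# skip_if_dataflow = True, HTCondor treats the job as a dataflow job and
# skips running it when its declared output files already exist and are
# newer than its declared input files -- here, the output file produced
# earlier by submit_output_job_cmd().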
def job_queue_events_for_sleep_job(test_dir, default_condor):
    sub_description = """
        executable = /bin/sleep
        arguments = 10

        queue
    """
    submit_file = write_file(test_dir / "job.sub", sub_description)

    submit_cmd = default_condor.run_command(["condor_submit", submit_file])
    clusterid, num_procs = parse_submit_result(submit_cmd)
    jobid = JobID(clusterid, 0)

    default_condor.job_queue.wait_for_events(
        {
            jobid: [
                (
                    # when the job starts running, hold it
                    SetJobStatus(JobStatus.RUNNING),
                    lambda jobid, event: default_condor.run_command(
                        ["condor_hold", jobid]
                    ),
                ),
                (
                    # once the job is held, release it
                    SetJobStatus(JobStatus.HELD),
                    lambda jobid, event: default_condor.run_command(
                        ["condor_release", jobid]
                    ),
                ),
                SetJobStatus(JobStatus.COMPLETED),
            ]
        },
        timeout=60,
    )

    return default_condor.job_queue.by_jobid[jobid]
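# For reference: each entry in the event list above is either a bare event to
# wait for, or an (event, callback) pair whose callback fires when that event
# is seen. A test consuming this fixture can therefore expect the returned
# event stream to include the RUNNING -> HELD -> IDLE -> ... -> COMPLETED
# JobStatus transitions, in order (a released job goes back to idle before
# running again).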
def peak_job(test_dir, resource):
    script_file = test_dir / "poll-memory.py"
    write_file(script_file, peak_job_script(resource))

    return {
        "executable": script_file.as_posix(),
        "arguments": "17",
        f"request_{resource}s": "1",
        "log": (test_dir / "events.log").as_posix(),
        "output": (test_dir / "poll-memory.$(Cluster).$(Process).out").as_posix(),
        "error": (test_dir / "poll-memory.$(Cluster).$(Process).err").as_posix(),
        "getenv": "true",
        "LeaveJobInQueue": "true",
    }
def clusterid_for_itemdata(test_dir, condor):
    # Enable late materialization, but with a high enough limit that all the
    # jobs show up immediately (on hold, because we don't need to actually
    # run the jobs to do the tests).
    sub_description = """
        executable = /bin/sleep
        arguments = 0

        request_memory = 1MB
        request_disk = 1MB

        max_materialize = 5

        hold = true

        My.Foo = "$(Item)"

        queue in (A, B, C, D, E)
    """
    submit_file = write_file(test_dir / "queue_in.sub", sub_description)

    submit_cmd = condor.run_command(["condor_submit", submit_file])
    clusterid, num_procs = parse_submit_result(submit_cmd)

    jobids = [JobID(clusterid, n) for n in range(num_procs)]
    condor.job_queue.wait_for_events(
        {jobid: [SetAttribute("Foo", None)] for jobid in jobids}, timeout=10
    )

    yield clusterid

    condor.run_command(["condor_rm", clusterid])
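# For reference: after materialization, procs 0 through 4 of this cluster
# should carry Foo = "A" through "E" respectively, one per item in the
# queue statement; tests use this cluster to inspect the stored itemdata.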
def condor(test_dir, slot_config, discovery_script, monitor_script):
    write_file(test_dir / "discovery.py", discovery_script)
    write_file(test_dir / "monitor.py", monitor_script)

    with Condor(
        local_dir=test_dir / "condor",
        config={**slot_config, "TEST_DIR": test_dir.as_posix()},
    ) as condor:
        # Ornithology will run condor_who to verify that all the daemons are
        # running, but occasionally, not all 16 slots will have made it to
        # the collector yet, so poll until they do (giving up after 20 tries).
        loop_count = 0
        while 16 != len(
            condor.status(ad_type=htcondor.AdTypes.Startd, projection=["SlotID"])
        ):
            loop_count += 1
            assert loop_count < 20
            time.sleep(1)

        yield condor
def submit_sleep_job_cmd(test_dir, default_condor, path_to_sleep):
    sub_description = """
        executable = {exe}
        arguments = 1

        queue
    """.format(exe=path_to_sleep)
    submit_file = write_file(test_dir / "submit" / "job.sub", sub_description)

    return default_condor.run_command(["condor_submit", submit_file])
def path_to_the_job_script(test_dir):
    script = """
    #!/usr/bin/python3

    import sys
    import time
    import getopt

    total_steps = 24
    num_completed_steps = 0
    try:
        with open("saved-state", "r") as saved_state:
            num_completed_steps = int(saved_state.readline().strip())
    except IOError:
        pass

    while num_completed_steps < total_steps:
        print(f"Starting step {num_completed_steps}.")
        time.sleep(3)
        num_completed_steps += 1

        if num_completed_steps % 5 == 0:
            print(f"Checkpointing after {num_completed_steps}.")
            try:
                with open("saved-state", "w") as saved_state:
                    saved_state.write(f"{num_completed_steps}")
                sys.exit(17)
            except IOError:
                print("Failed to write checkpoint.", file=sys.stderr)
                sys.exit(1)

    print(f"Completed all {total_steps} steps.")
    sys.exit(0)
    """

    path = test_dir / "counting.py"
    write_file(path, format_script(script))
    return path
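# A hedged sketch of the submit description this script is written for, using
# HTCondor's self-checkpointing support (the file names are illustrative, not
# taken from the test itself):
#
#   executable                = counting.py
#   checkpoint_exit_code      = 17
#   transfer_checkpoint_files = saved-state
#   should_transfer_files     = YES
#   queue
#
# With checkpoint_exit_code set, each sys.exit(17) above tells HTCondor to
# preserve "saved-state" and restart the executable, which then resumes from
# the recorded step count.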
def path_to_directory_script(default_condor, test_dir):
    script = f"""
    #!/bin/bash
    export CONDOR_CONFIG={default_condor.config_file}
    export PATH=$PATH:{os.environ["PATH"]}
    cat data/data_file >> {test_dir}/directory-test-file
    DATA=`cat data/data_file`
    if [[ $DATA == "first job modification" ]]; then
        echo "second job modification" > data/data_file
        exit 0
    fi
    echo "first job modification" > data/data_file
    condor_vacate_job $1
    # Don't exit before we've been vacated.
    sleep 60
    exit 0
    """

    path = test_dir / "directory.sh"
    write_file(path, format_script(script))
    return path
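# For reference: this script runs twice against the same transferred "data"
# directory. The first execution writes "first job modification" and vacates
# itself; the second sees that marker, writes "second job modification", and
# exits, letting the test verify that the directory round-tripped between runs.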
def path_to_prp_script(default_condor, test_dir):
    script = f"""
    #!/bin/bash
    export CONDOR_CONFIG={default_condor.config_file}
    export PATH=$PATH:{os.environ["PATH"]}
    DATA=`tail -n 1 {test_dir}/prp-test-file`
    echo "Starting up..." >> {test_dir}/prp-test-file
    if [[ $DATA == "" ]]; then
        echo "step one" >> {test_dir}/prp-test-file
        echo "step one" >> prp/data/data_file
        mkdir prp/data/subdir
        echo "step one" >> prp/data/subdir/other_data_file
        exit 85
    fi
    if [[ $DATA == "step one" ]]; then
        echo "step two" >> {test_dir}/prp-test-file
        echo "step two" >> prp/data/data_file
        condor_vacate_job $1
        # Don't exit before we've been vacated.
        sleep 60
        # We did not succeed.
        exit 1
    fi
    if [[ $DATA == "step two" ]]; then
        echo "step three" >> {test_dir}/prp-test-file
        echo "step three" >> prp/data/data_file
        exit 0
    fi
    echo "step never-never" >> {test_dir}/prp-test-file
    echo "step never-never" >> prp/data/data_file
    exit 1
    """

    path = test_dir / "prp.sh"
    write_file(path, format_script(script))
    return path
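# For reference: this script advances one "step" per execution, keyed off the
# last line of prp-test-file. Step one exits 85 (treated as a checkpoint,
# assuming the test submits it with a matching checkpoint_exit_code); step two
# vacates the job; step three succeeds. Any other state is an error.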
def path_to_job_two_script(test_dir):
    script = """
    #!/usr/bin/python3

    import sys
    import time

    nap = 0
    nap_lengths = [5, 10, 15]
    try:
        with open("saved-state", "r") as saved_state:
            nap = int(saved_state.readline().strip())
            print(f"Restarting naps from #{nap}")
    except IOError:
        pass

    print(f"Nap #{nap} will be {nap_lengths[nap]} seconds long.")
    time.sleep(nap_lengths[nap])
    nap += 1

    if nap >= len(nap_lengths):
        print("Completed all naps.")
        sys.exit(0)

    try:
        with open("saved-state", "w") as saved_state:
            saved_state.write(f"{nap}")
        sys.exit(17)
    except IOError:
        print("Failed to write checkpoint.", file=sys.stderr)
        sys.exit(1)
    """

    path = test_dir / "counting.py"
    write_file(path, format_script(script))
    return path
def submit_output_job_cmd(test_dir, default_condor, dataflow_input_file, dataflow_output_file):
    # Start by creating an input file.
    input_description = "dataflow-input"
    input_file = write_file(dataflow_input_file, input_description)

    # The first job in our workflow outputs a basic file to disk. Later jobs
    # will compare timestamps of input and output to determine whether they
    # are dataflow jobs.
    executable_description = """#!/bin/sh
echo "dataflow-output" > {output}
""".format(output=dataflow_output_file)
    executable_file = write_file(
        test_dir / "submit" / "dataflow-output.sh", executable_description
    )

    sub_description = """
        executable = {exe}

        queue
    """.format(exe=test_dir / "submit" / "dataflow-output.sh")
    submit_file = write_file(test_dir / "submit" / "dataflow-output.sub", sub_description)

    return default_condor.run_command(["condor_submit", submit_file])
def test_script(test_dir, test_script_contents):
    test_script = test_dir / "test_script.py"
    write_file(test_script, test_script_contents)
    return test_script
def job_python_file(test_dir):
    job_python_file = test_dir / "debug.py"
    contents = format_script("""
    #!/usr/bin/python3

    import classad
    import json
    import os
    import posixpath
    import shutil
    import socket
    import sys
    import time

    from urllib.parse import urlparse

    DEFAULT_TIMEOUT = 30
    PLUGIN_VERSION = '1.0.0'

    EXIT_SUCCESS = 0
    EXIT_FAILURE = 1
    EXIT_AUTHENTICATION_REFRESH = 2


    def print_help(stream = sys.stderr):
        help_msg = '''Usage: {0} -infile <input-filename> -outfile <output-filename>
           {0} -classad

    Options:
      -classad                    Print a ClassAd containing the capabilities of
                                  this file transfer plugin.
      -infile <input-filename>    Input ClassAd file
      -outfile <output-filename>  Output ClassAd file
      -upload                     Indicates this transfer is an upload (default
                                  is download)
    '''
        stream.write(help_msg.format(sys.argv[0]))


    def print_capabilities():
        capabilities = {
            'MultipleFileSupport': True,
            'PluginType': 'FileTransfer',
            'SupportedMethods': 'debug',
            'Version': PLUGIN_VERSION,
        }
        sys.stdout.write(classad.ClassAd(capabilities).printOld())


    def parse_args():
        # The only argument lists that are acceptable are
        # <this> -classad
        # <this> -infile <input-filename> -outfile <output-filename>
        # <this> -outfile <output-filename> -infile <input-filename>
        if not len(sys.argv) in [2, 5, 6]:
            print_help()
            sys.exit(EXIT_FAILURE)

        # If -classad, print the capabilities of the plugin and exit early
        if (len(sys.argv) == 2) and (sys.argv[1] == '-classad'):
            print_capabilities()
            sys.exit(EXIT_SUCCESS)

        # If -upload, set is_upload to True and remove it from the args list
        is_upload = False
        if '-upload' in sys.argv[1:]:
            is_upload = True
            sys.argv.remove('-upload')

        # -infile and -outfile must be in the first and third position
        if not (
                ('-infile' in sys.argv[1:]) and
                ('-outfile' in sys.argv[1:]) and
                (sys.argv[1] in ['-infile', '-outfile']) and
                (sys.argv[3] in ['-infile', '-outfile']) and
                (len(sys.argv) == 5)):
            print_help()
            sys.exit(1)

        infile = None
        outfile = None
        try:
            for i, arg in enumerate(sys.argv):
                if i == 0:
                    continue
                elif arg == '-infile':
                    infile = sys.argv[i+1]
                elif arg == '-outfile':
                    outfile = sys.argv[i+1]
        except IndexError:
            print_help()
            sys.exit(EXIT_FAILURE)

        return {'infile': infile, 'outfile': outfile, 'upload': is_upload}


    def format_error(error):
        return '{0}: {1}'.format(type(error).__name__, str(error))


    def get_error_dict(error, url = ''):
        error_string = format_error(error)
        error_dict = {
            'TransferSuccess': False,
            'TransferError': error_string,
            'TransferUrl': url,
        }
        return error_dict


    class DebugPlugin:

        # Extract whatever information we want from the url provided.
        # In this example, convert the example://path/to/file url to a
        # path in the file system (i.e. /path/to/file).
        def parse_url(self, url):
            url_path = url[(url.find("://") + 3):]
            return url_path

        def download_file(self, url, local_file_path):
            start_time = time.time()

            # Download transfer logic goes here
            print(f"DEBUG: download {url} -> {local_file_path}")
            file_size = 0

            end_time = time.time()

            # Get transfer statistics
            transfer_stats = {
                'TransferSuccess': True,
                'TransferProtocol': 'example',
                'TransferType': 'download',
                'TransferFileName': local_file_path,
                'TransferFileBytes': file_size,
                'TransferTotalBytes': file_size,
                'TransferStartTime': int(start_time),
                'TransferEndTime': int(end_time),
                'ConnectionTimeSeconds': end_time - start_time,
                'TransferUrl': url,
            }

            return transfer_stats

        def upload_file(self, url, local_file_path):
            start_time = time.time()

            # Upload transfer logic goes here
            print(f"DEBUG: upload {local_file_path} --> {url}")
            file_size = 0

            end_time = time.time()

            # Get transfer statistics
            transfer_stats = {
                'TransferSuccess': True,
                'TransferProtocol': 'example',
                'TransferType': 'upload',
                'TransferFileName': local_file_path,
                'TransferFileBytes': file_size,
                'TransferTotalBytes': file_size,
                'TransferStartTime': int(start_time),
                'TransferEndTime': int(end_time),
                'ConnectionTimeSeconds': end_time - start_time,
                'TransferUrl': url,
            }

            return transfer_stats


    if __name__ == '__main__':

        # Start by parsing input arguments
        try:
            args = parse_args()
        except Exception:
            sys.exit(EXIT_FAILURE)

        debug_plugin = DebugPlugin()

        # Parse in the classads stored in the input file.
        # Each ad represents a single file to be transferred.
        try:
            infile_ads = classad.parseAds(open(args['infile'], 'r'))
        except Exception as err:
            try:
                with open(args['outfile'], 'w') as outfile:
                    outfile_dict = get_error_dict(err)
                    outfile.write(str(classad.ClassAd(outfile_dict)))
            except Exception:
                pass
            sys.exit(EXIT_FAILURE)

        # Now iterate over the list of classads and perform the transfers.
        try:
            with open(args['outfile'], 'w') as outfile:
                for ad in infile_ads:
                    try:
                        if not args['upload']:
                            outfile_dict = debug_plugin.download_file(ad['Url'], ad['LocalFileName'])
                        else:
                            outfile_dict = debug_plugin.upload_file(ad['Url'], ad['LocalFileName'])

                        outfile.write(str(classad.ClassAd(outfile_dict)))

                    except Exception as err:
                        try:
                            outfile_dict = get_error_dict(err, url = ad['Url'])
                            outfile.write(str(classad.ClassAd(outfile_dict)))
                        except Exception:
                            pass
                        sys.exit(EXIT_FAILURE)

        except Exception:
            sys.exit(EXIT_FAILURE)
    """)
    write_file(job_python_file, contents)
    return job_python_file
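# For illustration, invoking the generated plugin by hand (hypothetical
# session; the exact attribute formatting and order come from
# classad.ClassAd.printOld()):
#
#   $ ./debug.py -classad
#   MultipleFileSupport = true
#   PluginType = "FileTransfer"
#   SupportedMethods = "debug"
#   Version = "1.0.0"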