def main():
    histogram = sys.argv[1]
    metadata_file = histogram + '.metadata'
    metadata_txt = get_histogram_metadata(histogram)
    run_cmd("echo '%s' > '%s'" % (metadata_txt, metadata_file))
    print("Created metadata file %s with contents:\n%s" % (metadata_file, metadata_txt))
def run(self): """Runs all Ntuple production jobs -- either locally or on the batch system. """ record_software_state(self.sw_ver_file_cfg, self.sw_ver_file_out, DEPENDENCIES) run_cmd( "make -f %s -j %i 2>%s 1>%s" % \ (self.makefile, self.num_parallel_jobs, self.stderr_file_path, self.stdout_file_path), False )
def get_scratch_dir(self):
    scratch_dir = "/scratch/%s" % getpass.getuser()
    if not os.path.exists(scratch_dir):
        print "Directory '%s' does not yet exist, creating it !!" % scratch_dir
        run_cmd(command_create_scratchDir)
    scratch_dir = os.path.join(
        scratch_dir,
        "tthAnalysis" + "_" + date.today().isoformat()
    )
    create_if_not_exists(scratch_dir)
    return scratch_dir
def main():
    input_files = sys.argv[1:]
    print("<check_that_histograms_are_valid.py>: input files = '%s'" % " ".join(input_files))
    run_cmd('sleep 20')
    for input_file in input_files:
        check_that_histogram_is_valid(input_file)
    print("All input files are ok.")
    sys.exit(0)
def submit_job_version2(self, task_name=None, command=None, output_dir=None):
    '''This method is similar to submitJob, but has fewer required parameters.
       It supports multiple lines of Bash commands instead of a fixed one-liner.
    '''
    print("SBatchManager#submit_job_version2(task_name=%s, command=%s, output_dir=%s)" % (
        task_name, command, output_dir))

    if not self.workingDir:
        raise ValueError("Please call 'setWorkingDir' before calling 'submitJob' !!")

    scratch_dir = self.get_scratch_dir()

    # Create script for executing jobs
    script_file = output_dir + "/cfgs/" + task_name + ".sh"
    wrapper_log_file = output_dir + "/logs/" + task_name + "_wrapper.log"
    executable_log_file = output_dir + "/logs/" + task_name + "_executable.log"

    run_cmd("mkdir -p '%s'" % (output_dir + "/cfgs/"))
    run_cmd("mkdir -p '%s'" % (output_dir + "/logs/"))

    sbatch_command = "%s --partition=%s --output=%s %s" % (
        self.command_submit,  # "sbatch"
        self.queue,
        wrapper_log_file,
        script_file
    )

    script = jinja2.Template(submit_job_version2_template).render(
        command=command,
        working_dir=self.workingDir,
        scratch_dir=scratch_dir,
        wrapper_log_file=wrapper_log_file,
        executable_log_file=executable_log_file,
        sbatch_command=sbatch_command
    )

    print "writing sbatch script file = '%s'" % script_file
    with codecs.open(script_file, "w", "utf-8") as f:
        f.write(script)

    # Run command
    sbatch_command_result = run_cmd(sbatch_command)
    job_id = sbatch_command_result.split()[-1]
    self.jobIds.append(job_id)
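# A minimal usage sketch of submit_job_version2 (paths and pool id are
# hypothetical; sbatchManager, setWorkingDir and waitForJobs are the ones
# defined in the surrounding snippets):
#
#   m = sbatchManager(uuid.uuid4())
#   m.setWorkingDir('/home/user/CMSSW/src/analysis/test')  # hypothetical dir
#   m.submit_job_version2(
#       task_name='touch_marker',              # no spaces allowed in the name
#       command='echo done > /tmp/marker.txt',
#       output_dir='/tmp/touch_marker_job/',
#   )
#   m.waitForJobs()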
def get_job_dir(self):
    if self.use_home:
        prefix = os.path.join('/home', getpass.getuser(), 'jobs')
    else:
        prefix = os.path.join('/scratch', getpass.getuser())
        if not hdfs.isdir(prefix):
            run_cmd('/scratch/mkscratch')
    job_dir = os.path.join(
        prefix,
        "%s_%s" % (self.analysisName, datetime.date.today().isoformat()),
    )
    return job_dir
def get_scratch_dir(self):
    scratch_dir = "/scratch/%s" % getpass.getuser()
    if not os.path.exists(scratch_dir):
        logging.info("Directory '%s' does not yet exist, creating it !!" % scratch_dir)
        run_cmd(command_create_scratchDir)
    scratch_dir = os.path.join(
        scratch_dir,
        "%s_%s" % (self.analysisName, datetime.date.today().isoformat()),
    )
    create_if_not_exists(scratch_dir)
    return scratch_dir
def executable_hadd_in_cluster_spec():
    # Prepare
    run_cmd("rm -rf %(temp_dir)s/executable_hadd_in_cluster_spec/*" % config)
    run_cmd("mkdir -p %(temp_dir)s/executable_hadd_in_cluster_spec/" % config)
    run_cmd("""echo "%(fixtures_dir)s/histogram_1.root\n%(fixtures_dir)s/histogram_2.root\n" > """ \
            """%(temp_dir)s/executable_hadd_in_cluster_spec/input_histograms_list.txt""" % config)

    # Run task
    run_cmd('python %(scripts_dir)s/hadd_in_cluster.py ' \
            '%(temp_dir)s/executable_hadd_in_cluster_spec/output_histogram.root ' \
            '%(temp_dir)s/executable_hadd_in_cluster_spec/input_histograms_list.txt' % config)

    # Check the result
    root_result_file = '%(temp_dir)s/executable_hadd_in_cluster_spec/output_histogram.root' % config
    result_successful = os.path.isfile(root_result_file)

    # Output result
    if result_successful:
        print('PASSED: Executable for HADD in cluster is WORKING')
    else:
        print('FAILED: Executable for HADD in cluster is NOT WORKING')

    return result_successful
def executable_hadd_in_cluster_spec():
    # Prepare
    run_cmd("rm -rf /home/%(user)s/tmp/executable_hadd_in_cluster_spec/*" % config)
    run_cmd("mkdir -p /home/%(user)s/tmp/executable_hadd_in_cluster_spec/" % config)
    fixtures_dir = '/home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/tthAnalysis/HiggsToTauTau/specification/fixtures/' % config
    run_cmd("""echo "%(fixtures_dir)s/histogram_1.root\n%(fixtures_dir)s/histogram_2.root\n" > /home/%(user)s/tmp/executable_hadd_in_cluster_spec/input_histograms_list.txt""" % {
        'fixtures_dir': fixtures_dir,
        'user': config['user']
    })

    # Run task
    run_cmd('python /home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/tthAnalysis/HiggsToTauTau/scripts/hadd_in_cluster.py '
            '/home/%(user)s/tmp/executable_hadd_in_cluster_spec/output_histogram.root '
            '/home/%(user)s/tmp/executable_hadd_in_cluster_spec/input_histograms_list.txt' % config)

    # Check the result
    root_result_file = '/home/%(user)s/tmp/executable_hadd_in_cluster_spec/output_histogram.root' % config
    result_successful = os.path.isfile(root_result_file)

    # Output result
    if result_successful:
        print('Executable for HADD in cluster is WORKING')
    else:
        print('Executable for HADD in cluster is NOT WORKING')

    return result_successful
def call_histogram_aggregation_on_cluster_node_spec():
    # Prepare
    run_cmd("rm -rf %(temp_dir)s/call_histogram_aggregation_on_cluster_node" % config)
    run_cmd("mkdir -p %(temp_dir)s/call_histogram_aggregation_on_cluster_node/" % config)

    # Add histograms and run task
    pool_id = uuid.uuid4()
    m = sbatchManager(pool_id)
    m.setWorkingDir('%(cmssw_base)s/src/analysis2mu1b1j/analysis2mu1b1j/test' % config)
    try:
        m.hadd_in_cluster(
            inputFiles=[
                '%(fixtures_dir)s/histogram_1.root' % config,
                '%(fixtures_dir)s/histogram_2.root' % config
            ],
            outputFile='%(temp_dir)s/call_histogram_aggregation_on_cluster_node/result.root' % config
        )
        m.waitForJobs()
    except:
        return False

    # Check result
    root_result_file = '%(temp_dir)s/call_histogram_aggregation_on_cluster_node/result.root' % config
    root_file_exists = os.path.isfile(root_result_file)
    if not root_file_exists:
        print('FAILED: HADD on cluster node failed - file is missing')
        return False

    histogram_metadata_file = root_result_file + '.metadata'
    root_file_metadata_txt = run_cmd('cat %s' % histogram_metadata_file)
    expected_metadata_txt = "events_count: 3629292.0"
    if root_file_metadata_txt.find(expected_metadata_txt) == -1:
        print('FAILED: Metadata "%s" is not correct, should be "%s"' %
              (root_file_metadata_txt, expected_metadata_txt))
        return False

    print('PASSED: HADD on cluster node worked')
    return True
def check_that_histograms_are_valid_with_invalid_metadata():
    # Prepare
    histogram_with_invalid_metadata = "%(fixtures_dir)s/histogram_with_invalid_metadata.root" % config
    histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config,
        histogram_with_invalid_metadata
    ]

    # Run task
    command = 'python %(scripts_dir)s/check_that_histograms_are_valid.py' % config
    command_arguments = " ".join(histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"
    command_output = run_cmd(command_with_arguments)

    # Check result
    expected_error_message = 'ERROR: real metadata does not match expected metadata for histogram: %s' % \
                             histogram_with_invalid_metadata
    if command_output.find(expected_error_message) == -1:
        print('Output must contain information that metadata does not match')
        return False

    if command_output.find('EXIT_STATUS_WAS: 1') == -1:
        print('Exit status must be 1 if metadata does not match')
        return False

    return True
def waitForJobs(self):
    """Waits for all sbatch jobs submitted by this instance of sbatchManager
       to finish processing
    """
    numJobs = len(self.jobIds)
    # print "<waitForJobs>: numJobs = %i" % numJobs
    if numJobs > 0:
        jobIds_per_poll_group = 500
        num_poll_groups = numJobs / jobIds_per_poll_group
        if (numJobs % jobIds_per_poll_group) > 0:
            num_poll_groups = num_poll_groups + 1
        whoami = getpass.getuser()
        while True:
            numJobs_left = 0
            for idx_poll_group in range(num_poll_groups):
                idx_first = idx_poll_group * jobIds_per_poll_group
                idx_last = min((idx_poll_group + 1) * jobIds_per_poll_group, numJobs)
                jobIds_poll_group = self.jobIds[idx_first:idx_last]
                command = "%s -u %s | grep \"%s\" | wc -l" % (
                    self.command_poll, whoami, "\\|".join(jobIds_poll_group))
                # print "idx_poll_group = %i: command = %s" % (idx_poll_group, command)
                poll_result = run_cmd(command, True).rstrip("\n")
                # print " poll_result = %s" % poll_result
                numJobs_left = numJobs_left + int(poll_result)
                time.sleep(1)
            # print "numJobs_left = %i" % numJobs_left
            if numJobs_left > 0:
                time.sleep(self.poll_interval)
            else:
                break
            logging.info("Waiting for sbatch to finish (%d jobs still left) ..." % numJobs_left)
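# For illustration only: assuming command_poll is "squeue" (the attribute is
# set elsewhere in this class), the rendered poll command for a group of two
# job IDs would read
#
#   squeue -u <user> | grep "1234567\|1234568" | wc -l
#
# i.e. the grep-ed line count approximates how many jobs of that poll group
# are still sitting in the batch queue.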
def check_that_histograms_are_valid_spec():
    # Prepare
    valid_histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config,
        "%(fixtures_dir)s/histogram_2.root" % config
    ]

    # Run task
    command = 'python %(scripts_dir)s/check_that_histograms_are_valid.py' % config
    command_arguments = " ".join(valid_histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"
    result = run_cmd(command_with_arguments)

    # Check result
    if result.find('All input files are ok.') == -1:
        print('Result must contain string "All input files are ok."')
        return False

    if result.find('EXIT_STATUS_WAS: 0') == -1:
        print('Exit status was not 0')
        return False

    return True
def check_that_histograms_are_valid_with_too_small_root_file_spec():
    # Prepare
    too_small_histogram = "%(fixtures_dir)s/histogram_too_small.root" % config
    histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config,
        too_small_histogram
    ]

    # Run task
    command = 'python %(scripts_dir)s/check_that_histograms_are_valid.py' % config
    command_arguments = " ".join(histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"
    result = run_cmd(command_with_arguments)

    # Check result
    if result.find('ERROR: root input file is too small (2 bytes): %s' % too_small_histogram) == -1:
        print('Output must contain error information about which file was too small')
        return False

    if result.find('EXIT_STATUS_WAS: 1') == -1:
        print('Exit status must be 1 if file was too small')
        return False

    return True
def check_that_histograms_are_valid_with_missing_input_histogram_spec():
    # Prepare
    missing_histogram = "%(fixtures_dir)s/histogram_THIS_DOES_NOT_EXIST.root" % config
    histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config,
        missing_histogram
    ]

    # Run task
    command = 'python %(scripts_dir)s/check_that_histograms_are_valid.py' % config
    command_arguments = " ".join(histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"
    result = run_cmd(command_with_arguments)

    # Check result
    if result.find('ERROR: root input file is missing: %s' % missing_histogram) == -1:
        print('Output must contain error information about which file was missing')
        return False

    if result.find('EXIT_STATUS_WAS: 1') == -1:
        print('Exit status must be 1 if file was missing')
        return False

    return True
def check_that_histograms_are_equal_with_unequal_data():
    # Prepare: the merged file deliberately does NOT correspond to these
    # inputs, so the event counts cannot match
    output_histogram = "%(fixtures_dir)s/hadd_of_histogram_1_and_broken.root" % config
    input_histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config,
        "%(fixtures_dir)s/histogram_2.root" % config
    ]

    # Run task
    command = 'python %(scripts_dir)s/check_that_histograms_are_equal.py' % config
    command_arguments = output_histogram + " " + " ".join(input_histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"
    result = run_cmd(command_with_arguments)

    # Check result
    if result.find('ERROR: count(output_histogram.events) != count(input_histograms.events)') == -1:
        print('Result must contain string "ERROR: count(output_histogram.events) != count(input_histograms.events)"')
        return False

    if result.find('EXIT_STATUS_WAS: 1') == -1:
        print('Exit status was not 1')
        return False

    return True
def check_that_histograms_are_equal_spec():
    # Prepare
    output_histogram = "%(fixtures_dir)s/hadd_of_histogram_1_and_2.root" % config
    input_histograms = [
        "%(fixtures_dir)s/histogram_1.root" % config,
        "%(fixtures_dir)s/histogram_2.root" % config
    ]

    # Run task
    command = 'python %(scripts_dir)s/check_that_histograms_are_equal.py' % config
    command_arguments = output_histogram + " " + " ".join(input_histograms)
    command_with_arguments = command + " " + command_arguments + "; echo EXIT_STATUS_WAS: $?;"
    result = run_cmd(command_with_arguments)

    # Check result
    if result.find('Output histogram event count is same as input histograms event counts sum') == -1:
        print('Result must contain string "Output histogram event count is same as input histograms event counts sum"')
        return False

    if result.find('EXIT_STATUS_WAS: 0') == -1:
        print('Exit status was not 0')
        return False

    return True
def submit(self, cmd_str):
    nof_max_retries = 10
    current_retry = 0
    job_id = None
    while current_retry < nof_max_retries:
        # Run command
        cmd_outerr = run_cmd(cmd_str, return_stderr=True)
        try:
            job_id = cmd_outerr[0].split()[-1]
            break
        except IndexError:
            # Fails if the stdout returned by the last line is empty
            logging.warning("Caught an error: '%s'; resubmitting %i-th time" %
                            (cmd_outerr[1], current_retry))
            current_retry += 1
            logging.debug("sleeping for %i seconds." % 60)
            time.sleep(60)  # Let's wait for 60 seconds until the next resubmission

    # The job ID must be a number, so.. we have to check if it really is one
    # (job_id stays None if all retries were exhausted)
    try:
        int(job_id)
    except (ValueError, TypeError):
        raise ValueError("job_id = '%s' NaN; sbatch stdout = '%s'; sbatch stderr = '%s'" % \
                         (job_id, cmd_outerr[0], cmd_outerr[1]))
    if job_id in self.submittedJobs:
        raise RuntimeError("Same job ID: %s" % job_id)
    # Is a valid job ID
    return job_id
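# Why .split()[-1] works: on success, sbatch prints a single line such as
# "Submitted batch job 1234567", so the last whitespace-separated token of
# stdout is the numeric job ID. A self-contained check of that parsing:
assert "Submitted batch job 1234567".split()[-1] == "1234567"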
def check_that_histogram_is_ready_for_usage(input_file):
    print("<check_that_histogram_is_ready_for_usage>: input file = '%s'" % input_file)
    polling_delay = 1  # in seconds
    polling_cmd = "fuser %s" % input_file
    is_file_ready = False
    while not is_file_ready:
        stdout, stderr = run_cmd(polling_cmd, return_stderr=True)
        print("Executed command '%s':" % polling_cmd)
        print("stdout = '%s'" % stdout)
        print("stderr = '%s'" % stderr)
        if not stdout and not stderr:
            # No one uses this file, it's free to use for everyone
            break
        if not stdout and stderr:
            # The file still doesn't exist?
            print(stderr.rstrip('\n'))
            sys.exit(1)
        else:
            # Both stdout and stderr contain text (PID and filename, respectively); wait ...
            time.sleep(polling_delay)
def execute_command_on_cluster_node_spec():
    # Prepare
    run_cmd("rm -rf %(temp_dir)s/execute_command_on_cluster_node_spec/*" % config)

    # Run task
    pool_id = uuid.uuid4()
    m = sbatchManager(pool_id)
    m.setWorkingDir('%(cmssw_base)s/src/analysis2mu1b1j/analysis2mu1b1j/test' % config)
    m.submit_job_version2(
        task_name='creating_result.txt',  # BUG: Task name can't include space
        command='''
            export TEST_DIR=%(temp_dir)s/execute_command_on_cluster_node_spec/
            mkdir -p $TEST_DIR
            echo "Worked" > $TEST_DIR/result.txt
        ''' % config,
        output_dir='%(temp_dir)s/execute_command_on_cluster_node_spec/' % config)

    # Check the result
    try:
        m.waitForJobs()
    except:
        got_exception = True
    else:
        got_exception = False
    if got_exception:
        return False

    with open('%(temp_dir)s/execute_command_on_cluster_node_spec/result.txt' % config) as f:
        result = f.read().strip()

    if result != 'Worked':
        print("$TEST_DIR/ did not contain result.txt with content 'Worked'.")
        print('FAILED: Execute on cluster node failed.')
        return False

    return True
def is_file_ok(output_file_name, validate_outputs=True, min_file_size=20000):
    if not (output_file_name and os.path.exists(output_file_name)):
        return False

    logging.info("Output file %s already exists" % output_file_name)

    if not output_file_name.lower().endswith('.root'):
        return True

    command = "rm %s" % output_file_name
    ret_value = False
    if min_file_size > 0:
        output_file_size = os.stat(output_file_name).st_size
        if output_file_size > min_file_size:
            if not validate_outputs:
                ret_value = True
        else:
            logging.info(
                "Deleting output file and resubmitting job because it has size smaller than %d bytes" %
                min_file_size)

    if validate_outputs:
        root_tfile = ROOT.TFile(output_file_name, "read")
        if not root_tfile:
            logging.info("Not a valid ROOT file, deleting it")
        else:
            if root_tfile.IsZombie():
                logging.info("Output file is corrupted, deleting file and resubmitting job")
            else:
                # Let's open the file via bash as well to see if ROOT tries to recover the file
                open_cmd = "root -b -l -q %s 2>&1 > /dev/null | grep 'trying to recover' | wc -l" % output_file_name
                open_out = run_cmd(open_cmd)
                if open_out.rstrip('\n') != '0':
                    logging.info("Output file is probably corrupted, deleting file and resubmitting job")
                else:
                    ret_value = True
            root_tfile.Close()

    if not ret_value:
        run_cmd(command)

    return ret_value
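# A minimal usage sketch (hypothetical file name; ROOT and run_cmd must be
# importable, as in the surrounding snippets). Note the side effect: a .root
# file that fails validation is removed via 'rm', so callers should treat a
# False return as "the output must be remade":
#
#   if not is_file_ok('out/histogram.root', validate_outputs=True):
#       resubmit_producing_job()  # hypothetical placeholder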
def run(self, clean):
    record_software_state(self.sw_ver_file_cfg, self.sw_ver_file_out, DEPENDENCIES)
    target = 'all'
    if clean:
        if not os.path.isfile(self.makefile_path):
            logging.error(
                "The makefile %s is missing and therefore it's not possible to clean anything; "
                "run sync Ntuple production first!" % self.makefile_path
            )
            sys.exit(1)
        target = 'clean'
    nof_parallel_jobs = len(self.channel_info)
    make_cmd = "make -f %s -j %d %s 2>%s 1>%s" % \
        (self.makefile_path, nof_parallel_jobs, target, self.stderr_file_path, self.stdout_file_path)
    logging.info("Running the make command: %s" % make_cmd)
    run_cmd(make_cmd)
    logging.info("All done")
def generate_sbatch_line(executable, cfg_file_name, input_file_names, output_file_name,
                         log_file_name=None, cvmfs_error_log=None):
    if os.path.exists(output_file_name):
        output_file_size = os.stat(output_file_name).st_size
        print "output file %s already exists, size = %i" % (output_file_name, output_file_size)
        if output_file_size > 20000:
            print "--> skipping job because it has size greater than 20000"
            return None
        else:
            print "--> deleting output file and resubmitting job because it has size smaller than 20000"
            command = "%s %s" % (executable_rm, output_file_name)
            run_cmd(command)

        if log_file_name and os.path.exists(log_file_name):
            log_file = open(log_file_name)
            is_time = False
            time = None
            is_hostname = False
            hostname = None
            is_cvmfs_error = False
            for line in log_file:
                if line.find("Time") != -1:
                    time = line.split(':')[1].strip()
                if line.find("Hostname") != -1:
                    hostname = line.split(':')[1].strip()
                if line.find("Transport endpoint is not connected") != -1:
                    is_cvmfs_error = True
            log_file.close()
            if is_cvmfs_error:
                print "Problem with cvmfs access reported in log file = '%s':" % log_file_name
                print " host = '%s': time = %s" % (hostname, time)
                if cvmfs_error_log:
                    if not hostname in cvmfs_error_log.keys():
                        cvmfs_error_log[hostname] = []
                    cvmfs_error_log[hostname].append(time)

    return "m.submitJob(%s, '%s', '%s', '%s', %s, '%s', True)" % (
        input_file_names,
        executable,
        cfg_file_name,
        os.path.dirname(output_file_name),
        [os.path.basename(output_file_name)],
        log_file_name
    )
def check_that_metadata_is_ok(input_file):
    print("<check_that_metadata_is_ok>: input file = '%s'" % input_file)
    metadata_file = input_file + '.metadata'
    expected_metadata_txt = run_cmd('cat %s' % metadata_file)
    real_metadata_txt = get_histogram_metadata(input_file)

    if real_metadata_txt.find(expected_metadata_txt) == -1:
        print("ERROR: Metadata for input file '%s' does not match expected value !!" % input_file)
        print("computed metadata = '%s'" % real_metadata_txt)
        print("expected metadata = '%s'" % expected_metadata_txt)
        sys.exit(1)
def run(self, clean):
    record_software_state(self.sw_ver_file_cfg, self.sw_ver_file_out, DEPENDENCIES)
    target = 'all'
    if clean:
        if not os.path.isfile(self.makefile_path):
            logging.error(
                "The makefile %s is missing and therefore it's not possible to clean anything; "
                "run sync Ntuple production first!" % self.makefile_path)
            sys.exit(1)
        target = 'clean'
    nof_parallel_jobs = len(self.channel_info)
    make_cmd = "make -f %s -j %d %s 2>%s 1>%s" % \
        (self.makefile_path, nof_parallel_jobs, target, self.stderr_file_path, self.stdout_file_path)
    if self.running_method.lower() == "makefile":
        run_dir = re.sub('^/home', '/scratch', self.config_dir)
        create_if_not_exists(run_dir)
        make_cmd = re.sub('^make', 'make -C {}'.format(run_dir), make_cmd)
    logging.info("Running the make command: %s" % make_cmd)
    run_cmd(make_cmd)
    logging.info("All done")
def call_histogram_aggregation_on_cluster_node_spec():
    # Prepare
    run_cmd("rm -rf /home/%(user)s/tmp/call_histogram_aggregation_on_cluster_node" % config)
    run_cmd("mkdir -p /home/%(user)s/tmp/call_histogram_aggregation_on_cluster_node/" % config)

    # Add histograms and run task
    m = sbatchManager()
    m.setWorkingDir('/home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/analysis2mu1b1j/analysis2mu1b1j/test' % config)
    m.hadd_in_cluster(
        inputFiles=[
            '/home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/tthAnalysis/HiggsToTauTau/specification/fixtures/histogram_1.root' % config,
            '/home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/tthAnalysis/HiggsToTauTau/specification/fixtures/histogram_2.root' % config
        ],
        outputFile='/home/%(user)s/tmp/call_histogram_aggregation_on_cluster_node/result.root' % config
    )
    m.waitForJobs()

    # Check result
    root_result_file = '/home/%(user)s/tmp/call_histogram_aggregation_on_cluster_node/result.root' % config
    result_successful = os.path.isfile(root_result_file)

    # Output result
    if result_successful:
        print('HADD on cluster node worked')
    else:
        print('HADD on cluster node failed')

    return result_successful
def execute_command_on_cluster_node_spec():
    # Prepare
    run_cmd("rm -rf /home/%(user)s/tmp/execute_command_on_cluster_node_spec/*" % config)

    # Run task
    m = sbatchManager()
    m.setWorkingDir('/home/%(user)s/VHbbNtuples_7_6_x/CMSSW_7_6_3/src/analysis2mu1b1j/analysis2mu1b1j/test' % config)
    m.submit_job_version2(
        task_name='creating_result.txt',  # BUG: Task name can't include space
        command='''
            export TEST_DIR=/home/%(user)s/tmp/execute_command_on_cluster_node_spec/
            mkdir -p $TEST_DIR
            echo "Worked" > $TEST_DIR/result.txt
        ''' % config,
        output_dir='/home/%(user)s/tmp/execute_command_on_cluster_node_spec/' % config
    )
    m.waitForJobs()

    # Check the result
    with open('/home/%(user)s/tmp/execute_command_on_cluster_node_spec/result.txt' % config) as f:
        result = f.read().strip()

    if result == 'Worked':
        print('Execute on cluster node passed.')
        return True

    print("$TEST_DIR/ did not contain result.txt with content 'Worked'.")
    print('Execute on cluster node failed.')
    return False
def get_histogram_metadata(histogram):
    sha1sum = run_cmd('sha1sum %s' % histogram).split(' ')[0]
    events_count = get_events_count(histogram)
    metadata = """sha1sum: %s\nevents_count: %i\n""" % (sha1sum, events_count)
    return metadata
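# For illustration only (hash value hypothetical): for a histogram file with
# 3629292 events, get_histogram_metadata() returns the two-line text
#
#   sha1sum: 5ba93c9db0cff93f52b521d7420e43f6eda2784f
#   events_count: 3629292
#
# which is what main() above writes into the '.metadata' file and what
# check_that_metadata_is_ok() later compares against.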
sh = jinja2.Template(sh_str).render(cmd=cmd)
sh_file = os.path.join(args.generate_jobs, 'job_%i.sh' % path_idx)
with open(sh_file, 'w') as f:
    f.write(sh)
log_file = os.path.join(args.generate_jobs, 'log_%i.txt' % path_idx)
job_params.append((log_file, sh_file))

# submit the jobs
submit_cmds = list(map(
    lambda job_param: 'sbatch --partition=small --output=%s %s' % job_param,
    job_params))
squeue_codes = []
for submit_cmd in submit_cmds:
    squeue_code = run_cmd(submit_cmd).split()[-1]
    squeue_codes.append(squeue_code)
    logging.info("Submitted sbatch job {jobId}".format(jobId=squeue_code))

has_completed = not bool(squeue_codes)
while not has_completed:
    squeue = run_cmd("squeue -j {jobIds} -h | wc -l".format(
        jobIds=','.join(squeue_codes))).rstrip('\n')
    if squeue == '0':
        has_completed = True
    logging.debug("{nofJobs} job(s) still running...".format(nofJobs=squeue))
    time.sleep(5)
logging.info("All jobs have been finished")
def run(self): """Runs all Ntuple production jobs -- either locally or on the batch system. """ run_cmd("make -f %s -j %i " % (self.makefile, self.num_parallel_jobs), False, self.stdout_file, self.stderr_file)
def run(self): """Runs all Ntuple production jobs -- either locally or on the batch system. """ run_cmd("make -f %s -j %i " % (self.makefile, self.num_parallel_jobs), False, self.stdout_file, self.stderr_file)
def poll(self, nonBlocking):
    """Waits for all sbatch jobs submitted by this instance of sbatchManager
       to finish processing
    """
    text_line = '-' * 120

    # Set a delimiter, which distinguishes entries b/w different jobs
    delimiter = ','
    # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
    # 1) squeue -h -u {{user}} -o '%i %256k'
    #      Collects the list of running jobs
    #        a) -h omits the header
    #        b) -u {{user}} looks only for jobs submitted by {{user}}
    #        c) -o '%i %256k' specifies the output format
    #           i)  %i    -- job ID (1st column)
    #           ii) %256k -- comment with a width of 256 characters (2nd column)
    #               If the job has no comment, the entry simply reads (null)
    # 2) grep {{comment}}
    #      Filter the jobs by the comment, which must be unique per sbatchManager instance at all times
    # 3) awk '{print $1}'
    #      Filter only the job IDs out
    # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
    #      Place all job IDs on one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
    command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                       "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
    command = jinja2.Template(command_template).render(
        user=self.user,
        pool_id_length=self.max_pool_id_length,
        comment=self.pool_id,
        delimiter=delimiter)

    # Initially, all jobs are marked as submitted, so we have to go through all jobs and check their exit codes
    # even if some of them have already finished
    jobIds_set = set([
        job_id for job_id in self.submittedJobs
        if self.submittedJobs[job_id]['status'] == Status.submitted
    ])
    nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
    while nofJobs_left > 0:
        # Get the list of jobs submitted to the batch system and convert their jobIds to a set
        poll_result, poll_result_err = '', ''
        while True:
            poll_result, poll_result_err = run_cmd(command, do_not_log=False, return_stderr=True)
            if not poll_result and poll_result_err:
                logging.warning('squeue caught an error: {squeue_error}'.format(
                    squeue_error=poll_result_err))
            else:
                break
            # sleep a minute and then try again
            # in principle we could limit the number of retries, but hopefully that's not necessary
            logging.debug("sleeping for %i seconds." % 60)
            time.sleep(60)
        polled_ids = set()
        if poll_result != '':
            polled_ids = set(poll_result.split(delimiter))

        # Check if the number of jobs submitted to the batch system is below maxSubmittedJobs;
        # if it is, take jobs from the queuedJobs list and submit them,
        # until a total of maxSubmittedJobs is submitted to the batch system
        nofJobs_toSubmit = min(len(self.queuedJobs), self.maxSubmittedJobs - len(polled_ids))
        if nofJobs_toSubmit > 0:
            logging.debug(
                "Jobs: submitted = {}, in queue = {} --> submitting the next {} jobs.".format(
                    len(polled_ids), len(self.queuedJobs), nofJobs_toSubmit))
        else:
            logging.debug(
                "Jobs: submitted = {}, in queue = {} --> waiting for submitted jobs to finish processing.".format(
                    len(polled_ids), len(self.queuedJobs)))
        for i in range(0, nofJobs_toSubmit):
            # randomly submit a job from the queue
            two_pow_sixteen = 65536
            random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
            max_idx = len(self.queuedJobs) - 1
            random_idx = random.randint(0, max_idx)
            job = self.queuedJobs.pop(random_idx)
            job['status'] = Status.submitted
            job_id = self.submit(job['sbatch_command'])
            self.submittedJobs[job_id] = job

        # Now check the status of jobs submitted to the batch system:
        # subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
        # jobs that have finished already
        finished_ids = list(jobIds_set - polled_ids)

        # Do not poll anything if currently there are no finished jobs
        if finished_ids:
            # Based on the job's exit code, check if the job has failed or completed successfully
            # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
            # Therefore, we want to restrict the output by grepping specific job IDs
            # There's another problem with that: the length of a bash command is limited by the ARG_MAX kernel
            # variable, which is of order 2e6
            # This means that we have to split the job IDs into chunks each of which we have to check separately
            finished_ids_chunks = [
                finished_ids[i:i + self.max_nof_greps]
                for i in range(0, len(finished_ids), self.max_nof_greps)
            ]
            for finished_ids_chunk in finished_ids_chunks:
                completion = self.check_job_completion(finished_ids_chunk)
                completed_jobs, running_jobs, failed_jobs = [], [], []
                for job_id, details in completion.iteritems():
                    if details.status == Status.completed:
                        completed_jobs.append(job_id)
                    elif details.status == Status.running:
                        running_jobs.append(job_id)
                    else:
                        failed_jobs.append(job_id)
                # If there are any failed jobs, throw
                if failed_jobs:
                    failed_jobs_str = ','.join(failed_jobs)
                    errors = [completion[job_id].status for job_id in failed_jobs]
                    logging.error(
                        "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}".format(
                            jobIds=failed_jobs_str,
                            reasons=', '.join(map(Status.toString, errors)),
                        ))
                    # Let's print a table where the first column corresponds to the job ID
                    # and the second column lists the exit code, the derived exit code, the status
                    # and the classification of the failed job
                    logging.error("Error table:")
                    for job_id in failed_jobs:
                        sys.stderr.write(
                            "{jobId} {exitCode} {derivedExitCode} {state} {status}\n".format(
                                jobId=job_id,
                                exitCode=completion[job_id].exit_code,
                                derivedExitCode=completion[job_id].derived_exit_code,
                                state=completion[job_id].state,
                                status=Status.toString(completion[job_id].status),
                            ))
                    sys.stderr.write('%s\n' % text_line)
                    for failed_job in failed_jobs:
                        for log in zip(['wrapper', 'executable'], ['log_wrap', 'log_exec']):
                            logfile = self.submittedJobs[failed_job][log[1]]
                            if os.path.isfile(logfile):
                                logfile_contents = open(logfile, 'r').read()
                            else:
                                logfile_contents = '<file is missing>'
                            sys.stderr.write(
                                'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'.format(
                                    id=failed_job,
                                    description=log[0],
                                    path=logfile,
                                    log=logfile_contents,
                                    line=text_line,
                                ))

                        if self.submittedJobs[failed_job]['nof_submissions'] < self.max_resubmissions and \
                           completion[failed_job].status == Status.io_error:
                            # The job is eligible for resubmission if the job hasn't been resubmitted more
                            # than a preset limit of resubmissions AND if the job failed due to I/O errors
                            logging.warning(
                                "Job w/ ID {id} and arguments {args} FAILED because: {reason} "
                                "-> resubmission attempt #{attempt}".format(
                                    id=failed_job,
                                    args=self.submittedJobs[failed_job]['args'],
                                    reason=Status.toString(completion[failed_job].status),
                                    attempt=self.submittedJobs[failed_job]['nof_submissions'],
                                ))
                            self.submitJob(*self.submittedJobs[failed_job]['args'])
                            # The old ID must be deleted, b/c otherwise it would be used to compare against
                            # squeue output and we would resubmit the failed job ad infinitum
                            del self.submittedJobs[failed_job]
                        else:
                            # We've exceeded the maximum number of resubmissions -> fail the workflow
                            raise Status.raiseError(completion[failed_job].status)
                else:
                    logging.debug(
                        "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}".format(
                            completedIds=','.join(completed_jobs),
                            runningInfo='(%s still running)' % ','.join(running_jobs) if running_jobs else '',
                        ))

                # Mark successfully finished jobs as completed so that we won't request their status code again
                # Otherwise they would still be at the 'submitted' state
                for job_id in completed_jobs:
                    if not all(map(
                            lambda outputFile: is_file_ok(
                                outputFile, validate_outputs=True, min_file_size=self.min_file_size),
                            self.submittedJobs[job_id]['outputFiles'])):
                        if self.submittedJobs[job_id]['nof_submissions'] < self.max_resubmissions:
                            logging.warning(
                                "Job w/ ID {id} and arguments {args} FAILED to produce a valid output file "
                                "-> resubmission attempt #{attempt}".format(
                                    id=job_id,
                                    args=self.submittedJobs[job_id]['args'],
                                    attempt=self.submittedJobs[job_id]['nof_submissions'],
                                ))
                            self.submitJob(*self.submittedJobs[job_id]['args'])
                            del self.submittedJobs[job_id]
                        else:
                            raise ValueError(
                                "Job w/ ID {id} FAILED because it repeatedly produces bogus output "
                                "file {output} yet the job still exits w/o any errors".format(
                                    id=job_id,
                                    output=', '.join(self.submittedJobs[job_id]['outputFiles']),
                                ))
                    else:
                        # Job completed just fine
                        self.submittedJobs[job_id]['status'] = Status.completed

        jobIds_set = set([
            job_id for job_id in self.submittedJobs
            if self.submittedJobs[job_id]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
        logging.info("Waiting for sbatch to finish (%d job(s) still left) ..." % nofJobs_left)
        if nofJobs_left > 0:
            if nonBlocking:
                return False
            two_pow_sixteen = 65536
            random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
            max_delay = 300
            random_delay = random.randint(0, max_delay)
            logging.debug("sleeping for %i seconds." % random_delay)
            time.sleep(self.poll_interval + random_delay)
        else:
            break
    return True
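# A minimal usage sketch of poll() (hypothetical; assumes an sbatchManager
# instance 'm' with jobs already queued via submitJob, as in the snippets
# above):
#
#   while not m.poll(nonBlocking=True):   # returns False while jobs remain
#       do_other_work()                   # hypothetical placeholder
#
# Calling m.poll(nonBlocking=False) instead blocks until every job has
# finished, raising as soon as a job fails beyond its resubmission budget.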
def waitForJobs(self):
    """Waits for all sbatch jobs submitted by this instance of sbatchManager
       to finish processing
    """
    text_line = '-' * 120

    # Set a delimiter, which distinguishes entries b/w different jobs
    delimiter = ','
    # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
    # 1) squeue -h -u {{user}} -o '%i %256k'
    #      Collects the list of running jobs
    #        a) -h omits the header
    #        b) -u {{user}} looks only for jobs submitted by {{user}}
    #        c) -o '%i %256k' specifies the output format
    #           i)  %i    -- job ID (1st column)
    #           ii) %256k -- comment with a width of 256 characters (2nd column)
    #               If the job has no comment, the entry simply reads (null)
    # 2) grep {{comment}}
    #      Filter the jobs by the comment, which must be unique per sbatchManager instance at all times
    # 3) awk '{print $1}'
    #      Filter only the job IDs out
    # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
    #      Place all job IDs on one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
    command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                       "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
    command = jinja2.Template(command_template).render(
        user=self.user,
        pool_id_length=self.max_pool_id_length,
        comment=self.pool_id,
        delimiter=delimiter)

    # Initially, all jobs are marked as submitted, so we have to go through all jobs and check their exit codes
    # even if some of them have already finished
    jobIds_set = set([
        id_ for id_ in self.jobIds
        if self.jobIds[id_]['status'] == Status.submitted
    ])
    nofJobs_left = len(jobIds_set)
    while nofJobs_left > 0:
        # Get the list of running jobs and convert them to a set
        poll_result, poll_result_err = '', ''
        while True:
            poll_result, poll_result_err = run_cmd(command, do_not_log=False, return_stderr=True)
            if not poll_result and poll_result_err:
                logging.warning('squeue caught an error: {squeue_error}'.format(
                    squeue_error=poll_result_err))
            else:
                break
            # sleep a minute and then try again
            # in principle we could limit the number of retries, but hopefully that's not necessary
            time.sleep(60)
        polled_ids = set(poll_result.split(delimiter))

        # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
        # jobs that have finished already
        finished_ids = list(jobIds_set - polled_ids)

        # Do not poll anything if currently there are no finished jobs
        if finished_ids:
            # Based on the job's exit code, check if the job has failed or completed successfully
            # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
            # Therefore, we want to restrict the output by grepping specific job IDs
            # There's another problem with that: the length of a bash command is limited by the ARG_MAX kernel
            # variable, which is of order 2e6
            # This means that we have to split the job IDs into chunks each of which we have to check separately
            finished_ids_chunks = [
                finished_ids[i:i + self.max_nof_greps]
                for i in range(0, len(finished_ids), self.max_nof_greps)
            ]
            for finished_ids_chunk in finished_ids_chunks:
                completion = self.check_job_completion(finished_ids_chunk)
                completed_jobs, running_jobs, failed_jobs = [], [], []
                for id_, details in completion.iteritems():
                    if details.status == Status.completed:
                        completed_jobs.append(id_)
                    elif details.status == Status.running:
                        running_jobs.append(id_)
                    else:
                        failed_jobs.append(id_)
                # If there are any failed jobs, throw
                if failed_jobs:
                    failed_jobs_str = ','.join(failed_jobs)
                    errors = [completion[id_].status for id_ in failed_jobs]
                    logging.error(
                        "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}".format(
                            jobIds=failed_jobs_str,
                            reasons=', '.join(map(Status.toString, errors)),
                        ))
                    # Let's print a table where the first column corresponds to the job ID
                    # and the second column lists the exit code, the derived exit code, the status
                    # and the classification of the failed job
                    logging.error("Error table:")
                    for id_ in failed_jobs:
                        sys.stderr.write(
                            "{jobId} {exitCode} {derivedExitCode} {state} {status}\n".format(
                                jobId=id_,
                                exitCode=completion[id_].exit_code,
                                derivedExitCode=completion[id_].derived_exit_code,
                                state=completion[id_].state,
                                status=Status.toString(completion[id_].status),
                            ))
                    sys.stderr.write('%s\n' % text_line)
                    for failed_job in failed_jobs:
                        for log in zip(['wrapper', 'executable'], ['log_wrap', 'log_exec']):
                            logfile = self.jobIds[failed_job][log[1]]
                            if os.path.isfile(logfile):
                                logfile_contents = open(logfile, 'r').read()
                            else:
                                logfile_contents = '<file is missing>'
                            sys.stderr.write(
                                'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'.format(
                                    id=failed_job,
                                    description=log[0],
                                    path=logfile,
                                    log=logfile_contents,
                                    line=text_line,
                                ))
                    # Raise the first error at hand
                    raise Status.raiseError(errors[0])
                else:
                    logging.debug(
                        "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}".format(
                            completedIds=','.join(completed_jobs),
                            runningInfo='(%s still running)' % ','.join(running_jobs) if running_jobs else '',
                        ))

                # Mark successfully finished jobs as completed so that we won't request their status code again
                # Otherwise they would still be at the 'submitted' state
                for id_ in completed_jobs:
                    self.jobIds[id_]['status'] = Status.completed

        jobIds_set = set([
            id_ for id_ in self.jobIds
            if self.jobIds[id_]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set)
        if nofJobs_left > 0:
            two_pow_sixteen = 65536
            random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
            max_delay = 300
            random_delay = random.randint(0, max_delay)
            time.sleep(self.poll_interval + random_delay)
        else:
            break
        logging.info("Waiting for sbatch to finish (%d job(s) still left) ..." % nofJobs_left)
import os

from tthAnalysis.HiggsToTauTau.jobTools import run_cmd

# set tests to the fastest priority queue
allowed_sbatch_priorities = ['prio', 'test']
if not os.environ.get('SBATCH_PRIORITY') in allowed_sbatch_priorities:
    print(
        'Will run tests in cluster using SBATCH_PRIORITY="prio". For faster execution on Quasar, use SBATCH_PRIORITY="test". ;)'
    )
    os.environ['SBATCH_PRIORITY'] = 'prio'

# initialize properties
user = run_cmd('whoami').strip()
cmssw_base = run_cmd('echo $CMSSW_BASE').strip()
temp_dir = '/home/%s/tmp/' % user
fixtures_dir = '%s/src/tthAnalysis/HiggsToTauTau/specification/fixtures/' % cmssw_base
sbatch_priority = run_cmd('echo $SBATCH_PRIORITY').strip()
scripts_dir = "%s/src/tthAnalysis/HiggsToTauTau/scripts/" % cmssw_base

# create config
config = {
    'user': user,
    'cmssw_base': cmssw_base,
    'temp_dir': temp_dir,
    'fixtures_dir': fixtures_dir,
    'sbatch_priority': sbatch_priority,
    'scripts_dir': scripts_dir
}
def hadd(input_files, output_file):
    cmd_str = 'hadd -f %s %s' % (output_file, ' '.join(input_files))
    stdout, stderr = run_cmd(cmd_str, do_not_log=True, return_stderr=True)
    if not stdout or stderr:
        raise RuntimeError('Error: %s' % stderr)
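# A minimal usage sketch (hypothetical file names; 'hadd' is ROOT's own
# histogram-merging CLI, and '-f' overwrites an existing output file):
#
#   hadd(['histogram_1.root', 'histogram_2.root'], 'merged.root')
#
# Any stderr output (or an empty stdout) from hadd is treated as a failed
# merge and raised as a RuntimeError.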
)
parser.add_argument(
    '-v', '--verbose',
    dest='verbose',
    action='store_true',
    default=False,
    required=False,
    help='R|Verbose output',
)
args = parser.parse_args()

samples = load_dict(args.dictionary, args.sample_name)

has_dasgoclient = run_cmd('which dasgoclient 2>/dev/null | wc -l',
                          do_not_log=True, return_stderr=False)
if has_dasgoclient.rstrip('\n') != "1":
    raise ValueError("dasgoclient not available! Set up your 94x environment")

has_voms_proxy = run_cmd('which voms-proxy-info 2>/dev/null | wc -l',
                         do_not_log=True, return_stderr=False)
if has_voms_proxy.rstrip('\n') != "1":
    raise ValueError("voms-proxy-* not available! Set up your 94x environment")

min_voms_proxy_timeleft_hours = 3
voms_proxy_timeleft = int(
    run_cmd('voms-proxy-info --timeleft', do_not_log=True,
def run(self): """Runs the complete analysis workfow -- either locally or on the batch system. """ run_cmd("make -f %s -j %i " % (self.makefile, self.num_parallel_jobs), False, self.stdout_file, self.stderr_file)
from tthAnalysis.HiggsToTauTau.jobTools import run_cmd

config = {
    'user': run_cmd('whoami').strip()
}
def submitJob(self, inputFiles, executable, cfgFile, outputFilePath, outputFiles,
              logFile=None, skipIfOutputFileExists=False):
    """Submits one job to the batch system via sbatch
    """
    # create script for executing jobs
    # (derived from the cfg file name up front, because the default log file
    # name below is based on it)
    script_file = cfgFile.replace(".py", ".sh")
    script_file = script_file.replace("_cfg", "")

    # raise if logfile missing
    if not logFile:
        if not self.logFileDir:
            raise ValueError("Please call 'setLogFileDir' before calling 'submitJob' !!")
        logFile = os.path.join(self.logFileDir,
                               os.path.basename(script_file).replace(".sh", ".log"))

    # if any of the output files exists, returns (Margus: BUG? Because only
    # that file should be skipped, not all?)
    if skipIfOutputFileExists:
        for outputFile in outputFiles:
            if os.path.exists(os.path.join(outputFilePath, outputFile)):
                print "output file = '%s' exists --> skipping !!" % os.path.join(outputFilePath, outputFile)
                return

    if not self.workingDir:
        raise ValueError("Please call 'setWorkingDir' before calling 'submitJob' !!")

    # create scratch dir
    scratchDir = "/scratch/%s" % getpass.getuser()
    if not os.path.exists(scratchDir):
        print "Directory '%s' does not yet exist, creating it !!" % scratchDir
        run_cmd(command_create_scratchDir)
    scratchDir = os.path.join(
        scratchDir,
        "tthAnalysis" + "_" + date.today().isoformat())
    create_if_not_exists(scratchDir)

    wrapper_log_file = logFile.replace('.log', '_wrapper.log')
    executable_log_file = logFile.replace('.log', '_executable.log')

    command = "%s --partition=%s --output=%s %s" % (
        self.command_submit, self.queue, wrapper_log_file, script_file)

    script = jinja2.Template(job_template).render(
        working_dir=self.workingDir,
        scratch_dir=scratchDir,
        exec_name=executable,
        cfg_file=cfgFile,
        inputFiles=" ".join(inputFiles),
        outputDir=outputFilePath,
        outputFiles=" ".join(outputFiles),
        wrapper_log_file=wrapper_log_file,
        executable_log_file=executable_log_file,
        RUNNING_COMMAND=command
    )
    print "writing sbatch script file = '%s'" % script_file
    with codecs.open(script_file, "w", "utf-8") as f:
        f.write(script)

    print "<submitJob>: command = %s" % command
    run_cmd_output = run_cmd(command)
    print "run_cmd_output: %s" % run_cmd_output
    ret_val = run_cmd_output.split()[-1]
    print "ret_val: %s" % ret_val
    job_id = ret_val.split()[-1]
    # print " jobId = %s" % jobId
    self.jobIds.append(job_id)
def run(self): """Runs the complete analysis workfow -- either locally or on the batch system. """ run_cmd("make -f %s -j %i " % (self.makefile, self.num_parallel_jobs), False, self.stdout_file, self.stderr_file)
def check_job_completion(self, jobsId_list, default_completion=Status.completed):
    completion = {
        k: JobCompletion(status=default_completion)
        for k in jobsId_list
    }

    # If the input list is empty, just return here (we don't want to mess up the subprocess commands here)
    if not completion:
        return completion

    # Set a delimiter, which distinguishes entries b/w different jobs
    delimiter = ','

    # First, let's try with sacct; explanation:
    # 1) sacct -X -P -n -o JobID,ExitCode,DerivedExitCode,State
    #      Shows job IDs, exit codes and comments of all submitted, running and finished jobs, one line per job
    #        a) -X -- shows cumulative statistics of each job (has no effect here, though)
    #        b) -P -- output will be '|' delimited without a '|' at the end
    #        c) -n -- omit header
    #        d) -o JobID,ExitCode,DerivedExitCode -- output format
    #        e) -S {datetime} -- look only for jobs submitted after {datetime}
    #        f) -j {jobs} -- filter out only the relevant jobs by their job ID (comma-separated list)
    # 2) sed ':a;N;$!ba;s/\\n/{delimiter}/g'
    #      Place all entries on one line, delimited by {delimiter} (otherwise the logs are hard to read)
    sacct_cmd = "sacct -X -P -n -o JobID,ExitCode,DerivedExitCode,State -S {datetime} -j {jobs} | " \
                "sed ':a;N;$!ba;s/\\n/{delimiter}/g'".format(
                    datetime=self.datetime,
                    jobs=','.join(jobsId_list),
                    delimiter=delimiter,
                )
    sacct_out, sacct_err = run_cmd(sacct_cmd, do_not_log=not self.log_completion, return_stderr=True)
    if not sacct_err and sacct_out:
        # The output of sacct contains one line per job; each line has pipe-separated fields, the order of
        # which is defined in the command that issued the output
        lines = sacct_out.split(delimiter)
        for line in lines:
            JobID, ExitCode, DerivedExitCode, State = line.split('|')
            if JobID in completion:
                completion[JobID] = JobCompletion(
                    status=Status.classify_error(ExitCode, DerivedExitCode, State),
                    exit_code=ExitCode,
                    derived_exit_code=DerivedExitCode,
                    state=State,
                )
        return completion
    else:
        # Likely returned along the lines of (due to heavy load on the cluster, since the SQL DB is overloaded):
        # sacct: error: Problem talking to the database: Connection refused
        logging.info('sacct currently unavailable: %s' % sacct_err)

    # Let's try with scontrol if the sacct command failed
    # scontrol doesn't have an option to take a list of Job IDs as an argument; thus, we have to grep the job IDs
    # Explanation:
    # 1) scontrol show -od job
    #      Prints out everything about running or recently finished jobs
    #        a) -o  -- prints information one line per record
    #        b) -d  -- includes more detailed information about the job
    #        c) job -- prints all jobs (it's possible to get information about other units like nodes and clusters)
    # 2) grep '{jobs}'
    #      Filter out jobs by their job ID (by concatenating the list with the escaped regex OR operator '\|')
    # 3) sed ':a;N;$!ba;s/\\n/{delimiter}/g'
    #      Put all the results on one line, where each record is delimited by {delimiter}
    scontrol_cmd = "scontrol show -od job | grep '{jobs}' | sed ':a;N;$!ba;s/\\n/{delimiter}/g'".format(
        jobs='\\|'.join(jobsId_list),
        delimiter=delimiter,
    )
    scontrol_out, scontrol_err = run_cmd(scontrol_cmd, do_not_log=not self.log_completion, return_stderr=True)
    if not scontrol_err and scontrol_out:
        # The output of scontrol contains one entry per line; each line contains space-delimited key-value
        # pairs, where the keys and values are separated by an equals sign
        # Although the keys do not contain any spaces, the values might, so we have to take care of that
        lines = scontrol_out.split(delimiter)
        for line in lines:
            line_dict = {}
            line_split_eq_spaces = map(lambda x: x.split(), line.split('='))
            for i in range(len(line_split_eq_spaces) - 1):
                k = line_split_eq_spaces[i]
                v = line_split_eq_spaces[i + 1]
                line_dict[k[-1]] = ' '.join(v[:-1] if i != len(line_split_eq_spaces) - 2 else v)
            if not 'JobId' in line_dict.keys():
                print("Skipping line = '%s'" % line)
                continue
            JobId = line_dict['JobId']
            if JobId in completion:
                completion[JobId] = JobCompletion(
                    status=Status.classify_error(
                        line_dict['ExitCode'],
                        line_dict['DerivedExitCode'],
                        line_dict['JobState'],
                    ),
                    exit_code=line_dict['ExitCode'],
                    derived_exit_code=line_dict['DerivedExitCode'],
                    state=line_dict['JobState'])
        return completion
    else:
        # scontrol probably returned something like:
        # slurm_load_jobs error: Invalid job id specified
        # Probably because too much time has passed since the job completion and checking the exit status here
        logging.info('scontrol has errors: %s' % scontrol_err)

    # scontrol still might fail if too much time has passed since the job's completion (the metadata about each
    # job is cached for a certain period of time, the length of which I don't know at the moment)
    # None of the SLURM commands work; let's just say that the job completed successfully
    logging.error("Cannot tell if the job has completed successfully or not!")
    return completion
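# For illustration only: a self-contained check (on a hypothetical, shortened
# scontrol record) of the '=' / space splitting used above. The last token of
# parts[i] is a key, and all-but-the-last tokens of parts[i + 1] form its
# value, which is exactly what the loop reassembles:
line = 'JobId=123 JobState=COMPLETED ExitCode=0:0 DerivedExitCode=0:0'
parts = [x.split() for x in line.split('=')]
line_dict = {}
for i in range(len(parts) - 1):
    k, v = parts[i], parts[i + 1]
    line_dict[k[-1]] = ' '.join(v[:-1] if i != len(parts) - 2 else v)
assert line_dict == {'JobId': '123', 'JobState': 'COMPLETED',
                     'ExitCode': '0:0', 'DerivedExitCode': '0:0'}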