def _option_consumes_next_token(parser, token):
    """Return True when *token* is an option whose value is the NEXT token.

    The answer comes from the options registered on *parser* rather than
    from a separate hard-coded list, so the two cannot drift apart:

    * an exactly spelled long option that takes a value (``--num-nodes``);
    * a short-option cluster whose final letter takes a value
      (``-o``, ``-vo``).

    Inline values (``--num-nodes=4``, ``-oFILE``) consume nothing further.
    Abbreviated long options are not recognised here.
    """
    if token.startswith('--'):
        option = parser.get_option(token)
        return option is not None and option.takes_value()
    for position in range(1, len(token)):
        option = parser.get_option('-' + token[position])
        if option is not None and option.takes_value():
            # Whatever follows a value-taking letter is its inline value.
            return position == len(token) - 1
    return False


def _split_command_line(parser, argv):
    """Split *argv* into ``(option_tokens, command_parts)``.

    Option tokens are the leading run of dash-prefixed arguments together
    with the separate values they consume; the command starts at the first
    token that is neither.  Either part may be empty.
    """
    total = len(argv)
    idx = 0
    while idx < total:
        token = argv[idx]
        if not token.startswith('-'):
            break
        idx += 1
        if _option_consumes_next_token(parser, token):
            idx += 1
    # A trailing value-taking option with no value would overshoot by one;
    # clamp and let optparse report the missing argument.
    idx = min(idx, total)
    return argv[:idx], argv[idx:]


def main(argv):
    """Write a job script for a command line and, unless -x is given, submit it.

    *argv* is the argument list without the program name: leading options
    (see the ``add_option`` calls below) followed by the command to run and
    its arguments.  The command words are joined with single spaces and
    handed to ``run_command_here`` together with the chosen file names.

    Flow:
      1. Choose the output file (``-o``, else ``<script>.out`` when ``-w`` is
         given, else a session temp file) and the script file (``-w`` or a
         session temp file).
      2. Create the script; echo it with ``-v``.
      3. Stop here with ``-x``.  Otherwise submit, and return immediately
         unless ``-W`` asks to wait for the job.
      4. After waiting (or after Ctrl-C, which also kills the job if one was
         submitted) print the output file and, if present, the ``.err`` file.

    Returns None.  Invalid options make optparse exit via ``SystemExit``.
    """
    parser = optparse.OptionParser()
    parser.add_option('-o', dest='output_file_name', default=None,
                      help="Destination file for the output. ")
    parser.add_option('-i', action='store', dest='input_file_name',
                      help="Write input file contents to command's stdin.")
    parser.add_option('-v', action='store_true', dest='verbose', default=False,
                      help="Display the script to be submitted.")
    parser.add_option('-q', action='store_true', dest='quiet', default=False,
                      help="Do not include information about processor nodes in the output, or the state of the job.")
    parser.add_option('-w', action='store', dest='script_name', default=None,
                      help="Save the script as this file name.")
    parser.add_option('-x', action='store_true', dest='dont_submit', default=False,
                      help="Don't submit the job. This is really only useful when combined with '-w'.")
    parser.add_option('-W', action='store_true', dest='wait', default=False,
                      help="Wait for the job to finish, and display its output.")
    parser.add_option('-n', action='store', dest='job_name', default=None,
                      help="PBS jobname")
    parser.add_option('-p', action='store', dest='numcpu', default=None,
                      help="Number of cpus to use.")
    parser.add_option('--num-nodes', action='store', dest='numnodes', default=None,
                      help="Number of nodes to use.")

    # Only the leading options belong to this tool; everything from the first
    # non-option token onwards is the command and must not reach optparse.
    option_args, command_parts = _split_command_line(parser, argv)
    options, _ = parser.parse_args(option_args)
    command = ' '.join(command_parts)

    with tempfile_util.Session(local=True) as session:
        # Pick a file to output to
        if options.output_file_name is not None:
            output_file_name = options.output_file_name
        elif options.script_name:
            output_file_name = options.script_name + '.out'
            # presumably registers the .err file with the session for
            # cleanup -- verify against tempfile_util
            session.add_name(output_file_name + '.err')
        else:
            output_file_name = session.temp_file_name('.submit_command.out')
            session.add_name(output_file_name + '.err')

        # Pick a script name
        if options.script_name is not None:
            script_file_name = options.script_name
        else:
            script_file_name = session.temp_file_name('.submit_command.pbs')

        # output_file_name is always set above, so it is the only fallback
        # needed when no explicit job name was given.
        if options.job_name is not None:
            job_name = options.job_name
        else:
            job_name = output_file_name

        # Create the script
        run_command_here(script_file_name, command,
                         input_file_name=options.input_file_name,
                         output_file_name=output_file_name,
                         job_name=job_name,
                         verbose=not options.quiet,
                         numcpu=options.numcpu,
                         numnodes=options.numnodes)

        # Single-argument print(...) emits exactly what the Python 2 print
        # statement did (including its separator spaces) and also parses
        # under Python 3.
        if options.verbose:
            with open(script_file_name) as script_file:
                print(script_file.read())

        # Maybe submit it
        if options.script_name is not None:
            print('Wrote %s.' % script_file_name)

        if options.dont_submit:
            return

        job_id = None
        try:
            if not options.quiet:
                print('Submitting...')
            job_id, = nice_submit.submit_files_until_done(
                [script_file_name], wait_for_all=False, quiet=options.quiet)

            # Without -W there is nothing to display: just exit
            if not options.wait:
                return

            if not options.quiet:
                print('Waiting on job  %s  to finish running.' % (job_id,))
            pbs.qwait(job_id)
        except KeyboardInterrupt:
            print('^C')
            # job_id is still None if the interrupt arrived before submission.
            if job_id:
                print('Killing  %s' % (job_id,))
                pbs.qdel(job_id)

        # Reached after a completed wait and also after Ctrl-C: show whatever
        # output exists while the session still holds its files.
        if os.path.exists(output_file_name):
            with open(output_file_name) as output_file:
                print(output_file.read())
        else:
            print("Output file doesn't seem to exist! Try:\ncat %s\n" % output_file_name)

        error_file_name = output_file_name + '.err'
        if os.path.exists(error_file_name):
            with open(error_file_name) as err_file:
                print(err_file.read())
def start_more_clients(self):
    """Submit enough client scripts to reach ``desired_number_of_clients``.

    Does nothing unless ``would_start_more_clients()`` is true.  Before
    submitting, checks whether clients appear to be failing:

    * If ``dead_man_switch`` is set, the currently listed clients become
      ``working_clients`` and ``clients_seen`` is reset.
    * Otherwise, if none of the current clients is in ``working_clients``,
      the current clients are added to ``clients_seen`` while fewer than
      ``2 * desired_number_of_clients`` have been seen; once that many have
      been seen and none of them is among the current clients, the clients
      are considered to be failing.

    ``dead_man_switch`` is cleared on every call that gets past the first
    guard.  Submission failures (``ns.NiceSubmitError``) are logged, not
    raised.

    Raises:
        PBSMapError: when all recently seen clients are gone and no working
            client is alive.
    """
    if not self.would_start_more_clients():
        return

    # alternative, turn the switch using the client names
    logging.info('checking if clients are failing...')
    clients = set(self.__clients())

    if self.dead_man_switch:
        self.working_clients = set(clients)
        self.clients_seen = set()
        logging.info('ok because of dead man switch')
    elif self.working_clients.isdisjoint(clients):
        logging.info('none of the current clients have returned work')
        # No client that is alive right now is known to have done work.
        seen_limit = self.desired_number_of_clients * 2
        if len(self.clients_seen) < seen_limit:
            # Not enough clients have been seen yet to call it a failure.
            logging.info('ok because have not seen too many clients: %d < %d',
                         len(self.clients_seen), seen_limit)
            self.clients_seen = self.clients_seen.union(clients)
        elif self.clients_seen.isdisjoint(clients):
            # The first 2*desired clients started since work was last seen
            # have all died, so jobs are failing.
            raise PBSMapError("All clients are failing.")

    self.dead_man_switch = False

    num_clients_to_start = self.desired_number_of_clients - len(clients)
    logging.info('Starting %d clients.', num_clients_to_start)
    if num_clients_to_start <= 0:
        return

    # The same script is submitted once per missing client.
    client_script_file_names = [self.client_script_file_name] * num_clients_to_start
    try:
        ns.submit_files_until_done(client_script_file_names, quiet=True,
                                   fail_when_max=True, retry_on_failure=False)
        if self.working_clients.isdisjoint(clients):
            if len(self.clients_seen) < self.desired_number_of_clients * 2:
                # Count the freshly listed clients that are not known workers.
                fresh = set(self.__clients()).difference(self.working_clients)
                self.clients_seen = self.clients_seen.union(fresh)
    except ns.NiceSubmitError as e:
        if isinstance(e, ns.ReachedMax):
            logging.warning('pbs submission limit reached.')
        elif isinstance(e, ns.QSubFailure):
            logging.warning('qsub failed to submit a job.')
        else:
            logging.warning('Some Nice submit failure: %s', e)

    # NOTE(review): the source formatting did not show whether this check
    # belongs inside the failure handler above; it is run after every
    # submission attempt here -- confirm against history.
    num_living = len(self.__clients())
    logging.info('num living = %s', num_living)
    if num_living == 0:
        logging.warning('Number of living clients is zero. Nothing to do.')
        time.sleep(10)