Beispiel #1
0
def main(argv):
    """Wrap a shell command in a PBS script, then optionally submit and wait.

    ``argv`` is the argument list without the program name: this tool's own
    options first, followed by the command to run.  Everything from the first
    argument that is neither an option nor an option's value is treated as
    the command and is passed through untouched, so the command may carry
    options of its own.

    The script is always written (via ``run_command_here``).  Unless ``-x`` is
    given it is then submitted; with ``-W`` the function also blocks until
    the job finishes and prints the job's output file and, when present, its
    ``.err`` companion.  Pressing Ctrl-C while submitting or waiting deletes
    the job (if one was created) and then still prints whatever output
    exists.

    Returns None in every case.
    """
    parser = optparse.OptionParser()
    parser.add_option('-o', dest='output_file_name', default=None,
                      help="Destination file for the output. ")
    parser.add_option('-i', action='store', dest='input_file_name',
                      help="Write input file contents to command's stdin.")
    parser.add_option('-v', action='store_true', dest='verbose', default=False,
                      help="Display the script to be submitted.")
    parser.add_option('-q', action='store_true', dest='quiet', default=False,
                      help="Do not include information about processor nodes in the output, or the state of the job.")
    parser.add_option('-w', action='store', dest='script_name', default=None,
                      help="Save the script as this file name.")
    parser.add_option('-x', action='store_true', dest='dont_submit', default=False,
                      help="Don't submit the job.  This is really only useful when combined with '-w'.")

    parser.add_option('-W', action='store_true', dest='wait', default=False,
                      help="Wait for the job to finish, and display its output.")

    parser.add_option('-n', action='store', dest='job_name',  default=None,
                      help="PBS jobname")
    parser.add_option('-p', action='store', dest='numcpu',
                      default=None,
                      help="Number of cpus to use.")
    parser.add_option('--num-nodes', action='store', dest='numnodes',
                      default=None,
                      help="Number of nodes to use.")

    # Locate the boundary between our options and the wrapped command.
    # optparse cannot do this itself because the command's own options must
    # not be parsed.  Default to "no command": that covers an empty argv and
    # an argv that ends with an option or with an option's value.
    value_flags = ('o', 'i', 'w', 'n', 'p')
    split_at = len(argv)
    skip_value = False
    for position, arg in enumerate(argv):
        if skip_value:
            # This argument is the value of the preceding option.
            skip_value = False
            continue
        if not arg.startswith('-'):
            split_at = position
            break
        if arg.startswith('--'):
            # '--num-nodes=N' carries its value inline; the bare form
            # consumes the next argument.
            skip_value = (arg == '--num-nodes')
        else:
            # Short option (possibly a cluster such as '-vo'): it takes a
            # separate value when its last letter is a value-taking flag.
            skip_value = arg[-1] in value_flags

    options, _ = parser.parse_args(argv[:split_at])
    command_parts = argv[split_at:]
    command = ' '.join(command_parts)

    with tempfile_util.Session(local=True) as session:
        # Pick a file to output to.  Only files this tool invents itself are
        # handed to the session via add_name()/temp_file_name(); an explicit
        # -o destination is left alone.
        if options.output_file_name is not None:
            output_file_name = options.output_file_name
        elif options.script_name:
            output_file_name = options.script_name + '.out'
            session.add_name(output_file_name + '.err')
        else:
            output_file_name = session.temp_file_name('.submit_command.out')
            session.add_name(output_file_name + '.err')

        # Pick a script name
        if options.script_name is not None:
            script_file_name = options.script_name
        else:
            script_file_name = session.temp_file_name('.submit_command.pbs')

        # The output file name is always set by now, so it is the fallback
        # job name whenever -n was not given.
        if options.job_name is not None:
            job_name = options.job_name
        else:
            job_name = output_file_name

        # Create the script
        run_command_here(script_file_name, command,
                         input_file_name=options.input_file_name,
                         output_file_name=output_file_name,
                         job_name=job_name,
                         verbose=not options.quiet,
                         numcpu=options.numcpu,
                         numnodes=options.numnodes)

        # NOTE: single-argument print(...) produces the same output under the
        # Python 2 print statement and the Python 3 print function.
        if options.verbose:
            with open(script_file_name) as script_file:
                print(script_file.read())

        if options.script_name is not None:
            print('Wrote %s.' % script_file_name)

        if options.dont_submit:
            return

        job_id = None
        try:
            if not options.quiet:
                print('Submitting...')
            job_id, = nice_submit.submit_files_until_done([script_file_name], wait_for_all=False,
                                                          quiet=options.quiet)

            # Without -W the job is left running and we are done.
            if not options.wait:
                return

            if not options.quiet:
                print('Waiting on job  %s  to finish running.' % (job_id,))
            pbs.qwait(job_id)

        except KeyboardInterrupt:
            print('^C')
            # job_id is still None when the interrupt arrived before the
            # submission returned; there is nothing to delete then.
            if job_id:
                print('Killing  %s' % (job_id,))
                pbs.qdel(job_id)

        # Reached after a completed wait and also after Ctrl-C: show whatever
        # the job managed to write.
        if os.path.exists(output_file_name):
            with open(output_file_name) as output_file:
                print(output_file.read())
        else:
            print("Output file doesn't seem to exist! Try:\ncat %s\n" % output_file_name)
        err_file_name = output_file_name + '.err'
        if os.path.exists(err_file_name):
            with open(err_file_name) as err_file:
                print(err_file.read())
Beispiel #2
0
    def start_more_clients(self):
        """Submit enough client jobs to reach ``desired_number_of_clients``.

        Returns immediately when ``would_start_more_clients()`` is false.
        Otherwise the set of currently living clients (``self.__clients()``)
        is checked against the bookkeeping sets before anything is submitted:

        * If ``dead_man_switch`` is set, every living client is recorded in
          ``working_clients`` and ``clients_seen`` is emptied.  (Presumably
          the switch is set elsewhere when a client returns work -- confirm
          against the rest of the class.)
        * Otherwise, if no living client is in ``working_clients``, living
          clients are added to ``clients_seen`` until that set holds twice the
          desired client count.  Once it is that large and none of its
          members is still alive, clients are considered to be failing.

        The switch is cleared on every call that gets past the first guard.
        One copy of ``client_script_file_name`` is then submitted per missing
        client.  Submission failures are logged rather than raised; when they
        leave no living client at all the call sleeps for 10 seconds.

        Raises:
            PBSMapError: no working client is alive, ``clients_seen`` is at
                its limit, and none of the seen clients is alive either.
        """
        if not self.would_start_more_clients():
            return

        logging.info('checking if clients are failing...')
        clients = set(self.__clients())
        seen_limit = self.desired_number_of_clients * 2
        if self.dead_man_switch:
            self.working_clients = set(clients)
            self.clients_seen = set()
            logging.info('ok because of dead man switch')
        elif self.working_clients.isdisjoint(clients):
            # None of the clients known to have worked is still alive.
            logging.info('none of the current clients have returned work')
            if len(self.clients_seen) < seen_limit:
                # Too few clients observed so far to conclude anything.
                logging.info('ok because have not seen too many clients: %d < %d',
                             len(self.clients_seen), seen_limit)
                self.clients_seen = self.clients_seen.union(clients)
            elif self.clients_seen.isdisjoint(clients):
                # Every client observed since the switch was last set has
                # died without a working client being alive: jobs are failing.
                raise PBSMapError("All clients are failing.")

        self.dead_man_switch = False

        num_clients_to_start = self.desired_number_of_clients - len(clients)
        if num_clients_to_start <= 0:
            return
        # Logged only after the guard so it never reports a zero or negative
        # number of clients.
        logging.info('Starting %d clients.', num_clients_to_start)

        client_script_file_names = [self.client_script_file_name] * num_clients_to_start

        try:
            ns.submit_files_until_done(client_script_file_names,
                                       quiet=True,
                                       fail_when_max=True,
                                       retry_on_failure=False)
            # Record the freshly started clients too, under the same
            # conditions as the pre-submission bookkeeping above.
            if self.working_clients.isdisjoint(clients):
                if len(self.clients_seen) < seen_limit:
                    newly_living = set(self.__clients()).difference(self.working_clients)
                    self.clients_seen = self.clients_seen.union(newly_living)
        except ns.NiceSubmitError as e:
            if isinstance(e, ns.ReachedMax):
                logging.warning('pbs submission limit reached.')
            elif isinstance(e, ns.QSubFailure):
                logging.warning('qsub failed to submit a job.')
            else:
                logging.warning('Some Nice submit failure: %s', e)
            num_living = len(self.__clients())
            logging.info('num living = %s', num_living)
            if num_living == 0:
                logging.warning('Number of living clients is zero.  Nothing to do.')
                # Back off before the caller tries again.
                time.sleep(10)