def main(args):
    """
    Run one job per input file and validate each job once it has finished.

    Every job is first submitted to the 'debug' queue.  The jobs are then
    polled through lsf.bjobs() every two seconds.  On each poll the pending
    jobs are reported, grouped by queue, to a whackamole.Switcher
    (NOTE(review): presumably the switcher uses the module-level switch
    callback to move pending jobs between queues -- confirm).  A job that
    no longer shows up in the bjobs listing is treated as finished: it is
    announced on stdout and its validate() method is called with the
    expected names.
    @param args: options object; this function reads its exe_dir, exe_name,
    expected, files, and scratch attributes
    """
    # the executable handed to every job
    exepath = os.path.join(args.exe_dir, args.exe_name)
    # the names that each finished job is validated against
    with open(args.expected) as fin:
        expected_names = set(nonempty_stripped_lines(fin))
    # list the accessible queues and exchange 'debug' with the first entry
    queue_names = list(lsf.gen_accessible_queues())
    debug_index = queue_names.index('debug')
    displaced = queue_names[0]
    queue_names[0] = queue_names[debug_index]
    queue_names[debug_index] = displaced
    switcher = whackamole.Switcher(queue_names, switch)
    # build the jobs; the position of each file in the argument list
    # is appended to the name of its output file
    jobs = []
    for position, datapath in enumerate(args.files):
        outname = os.path.basename(datapath) + '.names.' + str(position)
        outpath = os.path.join(args.scratch, outname)
        jobs.append(Job(datapath, exepath, outpath))
    # submit everything to the debug queue, indexing the jobs by job number
    number_to_job = {}
    for job in jobs:
        job.submit('debug')
        number_to_job[job.b.job_number] = job
    switcher.on_submission({'debug': set(number_to_job)})
    # job numbers are tracked in two sets: everything that was submitted,
    # and whatever was still listed as of the most recent poll
    all_numbers = set(number_to_job)
    unfinished = set(number_to_job)
    while unfinished:
        time.sleep(2.0)
        # each observation is a (job number, state, queue) triple
        observations = list(lsf.bjobs())
        # let the switcher see which jobs are pending in which queue
        pending_by_queue = collections.defaultdict(set)
        for jnum, state, queue in observations:
            if state == 'PEND':
                pending_by_queue[queue].add(jnum)
        switcher.on_observation(pending_by_queue)
        # a submitted job that is still listed has not finished yet
        still_listed = all_numbers.intersection(
                row[0] for row in observations)
        # jobs that were listed on the previous poll but not on this one
        for jnum in unfinished.difference(still_listed):
            # a single parenthesized argument prints the same line whether
            # print is the python 2 statement or a function
            print(' '.join(['job', str(jnum), 'has finished']))
            number_to_job[jnum].validate(expected_names)
        unfinished = still_listed
# Example #2
# 0
def main(args):
    """
    Time batches of jobs of increasing size and write a tab separated report.

    Batch sizes come from gen_powers_of_two(); iteration stops at the first
    size that exceeds the number of input files.  For each size the leading
    files of args.files are processed, on the queue named by args.queue
    when one is given, and otherwise with queue switching over the
    accessible queues (with 'debug' exchanged into the first position).
    One row is written per batch with these columns:
    N is the batch size,
    MEAN_SUCCESS_WALL_RUN_TIME is the mean run time reported by lsf.bhist
    over the jobs that stayed under the time limit, or '-' if none did,
    SUCCESS_PROPORTION is the number of such jobs divided by the batch size,
    BATCH_WALL_TIME is the elapsed time.time() around the processing call.
    @param args: options object; this function reads its minutes, exe_dir,
    exe_name, queue, report, and files attributes
    """
    # a job whose run time reaches this many seconds counts as a failure
    nseconds = args.minutes * 60
    exepath = os.path.join(args.exe_dir, args.exe_name)
    # the queue list is only needed when no fixed queue was requested
    if not args.queue:
        queues = list(lsf.gen_accessible_queues())
        debug_index = queues.index('debug')
        displaced = queues[0]
        queues[0] = queues[debug_index]
        queues[debug_index] = displaced
    with open(args.report, 'w') as fout:
        headings = (
                'N',
                'MEAN_SUCCESS_WALL_RUN_TIME',
                'SUCCESS_PROPORTION',
                'BATCH_WALL_TIME')
        fout.write('\t'.join(headings) + '\n')
        for cap in gen_powers_of_two():
            # stop as soon as there are not enough files to fill a batch
            if cap > len(args.files):
                break
            filenames = args.files[:cap]
            # run the whole batch, timing the call from the outside
            tm_start = time.time()
            if args.queue:
                jnums = process_no_switching(
                        filenames, exepath, args.queue, args.minutes)
            else:
                jnums = process_switching(
                        filenames, exepath, queues, args.minutes)
            tm_end = time.time()
            # bhist yields (job id, seconds in the run state) pairs;
            # only the run times matter here
            run_times = [t for _, t in lsf.bhist(jnums)]
            successes = [t for t in run_times if t < nseconds]
            nsuccesses = len(successes)
            # the mean is undefined when no job succeeded
            mswrt = sum(successes) / float(nsuccesses) if successes else '-'
            success_proportion = nsuccesses / float(cap)
            batch_wall_time = tm_end - tm_start
            typed_row = [cap, mswrt, success_proportion, batch_wall_time]
            fout.write('\t'.join(map(str, typed_row)) + '\n')