import collections
import os
import time

# project-specific modules providing the LSF wrappers and the queue switcher
import lsf
import whackamole


def main(args):
    # define the path to the exe
    exepath = os.path.join(args.exe_dir, args.exe_name)
    # read the expected chromosome names
    with open(args.expected) as fin:
        expected_names = set(nonempty_stripped_lines(fin))
    # get the defined queues and put the debug queue at the front
    queues = list(lsf.gen_accessible_queues())
    i = queues.index('debug')
    queues[0], queues[i] = queues[i], queues[0]
    # define the switcher
    switcher = whackamole.Switcher(queues, switch)
    # create the jobs
    jobs = []
    for i, datapath in enumerate(args.files):
        gzname = os.path.basename(datapath)
        outpath = os.path.join(args.scratch, gzname + '.names.' + str(i))
        jobs.append(Job(datapath, exepath, outpath))
    number_to_job = {}
    # submit all of the jobs to the debug queue
    for job in jobs:
        job.submit('debug')
        number_to_job[job.b.job_number] = job
    # tell the switcher that all jobs were submitted to the debug queue
    switcher.on_submission({'debug': set(number_to_job)})
    # initialize the set of unfinished job numbers
    all_numbers = set(number_to_job)
    prev_unfinished = set(number_to_job)
    # go until all of the jobs have finished
    while prev_unfinished:
        time.sleep(2.0)
        # query the state of the submitted jobs
        jnum_state_queue_triples = list(lsf.bjobs())
        # possibly switch some queues
        qname_to_jnums = collections.defaultdict(set)
        for jnum, state, queue in jnum_state_queue_triples:
            if state == 'PEND':
                qname_to_jnums[queue].add(jnum)
        switcher.on_observation(qname_to_jnums)
        # check for finished jobs
        # (unpack into qnames rather than queues so that the queue list
        # given to the switcher is not clobbered)
        if jnum_state_queue_triples:
            jnums, states, qnames = zip(*jnum_state_queue_triples)
        else:
            jnums, states, qnames = [], [], []
        curr_unfinished = all_numbers & set(jnums)
        newly_finished = prev_unfinished - curr_unfinished
        for jnum in newly_finished:
            print 'job', jnum, 'has finished'
            number_to_job[jnum].validate(expected_names)
        prev_unfinished = curr_unfinished
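# nonempty_stripped_lines, Job, and switch are defined elsewhere in the
# project and are not shown here.  For reference, a minimal sketch of how
# nonempty_stripped_lines presumably behaves, judging from its name and its
# use with set() above -- an assumption, not the actual helper:
def _nonempty_stripped_lines_sketch(fin):
    # yield each line with surrounding whitespace removed, skipping blanks
    for line in fin:
        line = line.strip()
        if line:
            yield line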
def main(args):
    nseconds = args.minutes * 60
    # define the path to the exe
    exepath = os.path.join(args.exe_dir, args.exe_name)
    # get the defined queues and put the debug queue at the front
    if not args.queue:
        queues = list(lsf.gen_accessible_queues())
        i = queues.index('debug')
        queues[0], queues[i] = queues[i], queues[0]
    # start writing the report
    with open(args.report, 'w') as fout:
        headings = [
            'N', 'MEAN_SUCCESS_WALL_RUN_TIME',
            'SUCCESS_PROPORTION', 'BATCH_WALL_TIME']
        print >> fout, '\t'.join(headings)
        # run the parallel processing for an increasing number of nodes
        for cap in gen_powers_of_two():
            if cap > len(args.files):
                break
            filenames = args.files[:cap]
            # run a few jobs to completion
            tm_start = time.time()
            if args.queue:
                jnums = process_no_switching(
                    filenames, exepath, args.queue, args.minutes)
            else:
                jnums = process_switching(
                    filenames, exepath, queues, args.minutes)
            tm_end = time.time()
            # get the seconds of wall time each job was in the run state
            id_run_wall_pairs = list(lsf.bhist(jnums))
            # if a job took too long then it failed
            successes = [t for j, t in id_run_wall_pairs if t < nseconds]
            failures = [t for j, t in id_run_wall_pairs if t >= nseconds]
            nsuccesses = len(successes)
            nfailures = len(failures)
            if successes:
                mswrt = sum(successes) / float(nsuccesses)
            else:
                mswrt = '-'
            success_proportion = nsuccesses / float(cap)
            # write the results
            typed_row = [cap, mswrt, success_proportion, tm_end - tm_start]
            print >> fout, '\t'.join(str(x) for x in typed_row)
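# gen_powers_of_two is defined elsewhere and not shown here; judging from its
# name and from the explicit break in the loop above, it is presumably an
# unbounded generator along these lines (a sketch, not the actual helper):
def _gen_powers_of_two_sketch():
    # yield 1, 2, 4, 8, ... forever; the caller is expected to break
    cap = 1
    while True:
        yield cap
        cap *= 2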