def main(args):
    """
    Submit one LSF job per input file, then wait for every job to finish.

    Each job runs a shell pipeline which decompresses one input file with
    zcat, pipes it through the extract-pileup-chromosome and pileup-to-acgtn
    executables located in args.exe_dir, and splits the resulting stream
    into fixed-size chunk files inside a per-input output subdirectory.
    A line is printed for each job as soon as it is seen to have finished.

    @param args: options object providing the attributes exe_dir,
    chromosome, ref_sequence, files, out_dir, namescheme and queue
    """
    extractcmd = os.path.join(
            args.exe_dir, 'extract-pileup-chromosome ' + args.chromosome)
    processcmd = os.path.join(
            args.exe_dir, 'pileup-to-acgtn ' + args.ref_sequence)
    # Split the output into chunks to enable parallelism.
    # Each position has 4-byte A, C, G, T, and N counts,
    # for a total of 20 bytes.
    nbytes_per_position = 20
    npositions_per_file = 1000000
    nbytes_per_chunk = nbytes_per_position * npositions_per_file
    # define the time and memory required for each job
    nminutes = 5
    nkilobytes = 1000
    # create the jobs
    bsubs = []
    for datapath in args.files:
        # define and create the output subdirectory
        outdir = os.path.join(args.out_dir, args.namescheme(datapath))
        try:
            os.makedirs(outdir)
        except OSError:
            # Creating first and checking afterwards avoids the race
            # between testing for the path and creating it.
            # An already existing path is not an error.
            if not os.path.exists(outdir):
                raise
        # create the job
        b = lsf.Bsub()
        b.flags = {
                'W': nminutes,
                'M': nkilobytes,
                'o': 'out',
                'e': 'err'}
        if args.queue:
            b.flags['q'] = args.queue
        # NOTE(review): the paths are interpolated into the command line
        # without quoting, so paths containing whitespace or shell
        # metacharacters will presumably break the pipeline.
        splitcmd = ' '.join([
            'split',
            '--bytes=%d' % nbytes_per_chunk,
            '--suffix-length=4',
            '--numeric-suffixes',
            '-',
            os.path.join(outdir, 'x')])
        cmd = 'zcat %s | %s | %s | %s' % (
                datapath, extractcmd, processcmd, splitcmd)
        b.commands = [cmd]
        bsubs.append(b)
    # submit the jobs, relaying the output of each submission
    jnums = set()
    for b in bsubs:
        out, err = b.submit()
        sys.stdout.write(out)
        sys.stderr.write(err)
        jnums.add(b.job_number)
    # initialize the set of unfinished job numbers
    prev_unfinished = set(jnums)
    # wait for all of the jobs to finish
    while True:
        # restrict the unfinished jobs to the ones submitted here
        curr_unfinished = jnums & set(lsf.gen_unfinished_job_numbers())
        newly_finished = prev_unfinished - curr_unfinished
        for jnum in newly_finished:
            # A single formatted string prints the same text
            # under both the print statement and the print function.
            print('job %s has finished' % (jnum,))
        if not curr_unfinished:
            break
        prev_unfinished = curr_unfinished
        time.sleep(2.0)
""" Run a command line using bsub with some hardcoded parameters. """ import sys import time import lsf if __name__=='__main__': b = lsf.Bsub() b.flags = { 'n' : 1, 'W' : 720, 'q' : 'dean'} cmd = ' '.join("'%s'" % v for v in sys.argv[1:]) b.commands = [cmd] out, err = b.submit() sys.stdout.write(out) sys.stderr.write(err) while b.job_number not in lsf.gen_unfinished_job_numbers(): time.sleep(0.5)