def compute_all_genes_psi(gff_dir, bam_filename, read_len, output_dir,
                          use_cluster=False, chunk_jobs=200,
                          overhang_len=1, paired_end=None,
                          settings=None):
    """
    Compute Psi values for genes using a GFF and a BAM filename.

    One 'run_miso.py --compute-gene-psi' command is built for every gene
    returned by gff_utils.get_gene_ids_to_gff_index(gff_dir).  The commands
    are either executed immediately with os.system, or, when use_cluster is
    set, grouped into batches and submitted through
    cluster_utils.run_on_cluster.

    - gff_dir: GFF index location, passed to
      gff_utils.get_gene_ids_to_gff_index.
    - bam_filename: BAM filename passed to every command.
    - read_len: read length (integer), passed as --read-len.
    - output_dir: output directory; created if it does not exist.
    - use_cluster: if true, submit batched commands to the cluster instead
      of running them locally.
    - chunk_jobs: approximate number of commands per cluster batch.  A
      falsy value (None, 0, False) falls back to 200.
    - overhang_len: passed as --overhang-len when paired_end is None.
    - paired_end: None for single-end mode, otherwise a pair whose first
      two items are passed as --paired-end <mean> <sd>.
    - settings: optional settings filename, passed as --settings-filename
      and forwarded to Settings.load and cluster_utils.run_on_cluster.
    """
    gene_ids_to_gff_index = gff_utils.get_gene_ids_to_gff_index(gff_dir)
    num_genes = len(gene_ids_to_gff_index)
    miso_run = os.path.join(miso_path, "run_miso.py")

    print("Computing gene-level Psi for %d genes..." % (num_genes))
    print(" - GFF index: %s" % (gff_dir))
    print(" - BAM: %s" % (bam_filename))
    print(" - Read length: %d" % (read_len))
    print(" - Output directory: %s" % (output_dir))

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Options shared by every per-gene command
    if paired_end is not None:
        # Run in paired-end mode
        frag_mean = float(paired_end[0])
        frag_sd = float(paired_end[1])
        mode_opts = " --paired-end %.1f %.1f" % (frag_mean, frag_sd)
    else:
        mode_opts = " --overhang-len %d" % (overhang_len)
    # Add settings filename if given
    if settings is not None:
        mode_opts += " --settings-filename %s" % (settings)

    # All commands to run (only filled in cluster mode)
    all_miso_cmds = []
    for gene_id, gff_index_filename in gene_ids_to_gff_index.items():
        miso_cmd = "python %s --compute-gene-psi \"%s\" \"%s\" %s %s --read-len %d " \
                   % (miso_run, gene_id, gff_index_filename, bam_filename,
                      output_dir, read_len)
        miso_cmd += mode_opts

        if use_cluster:
            # If asked to use cluster, accumulate the MISO commands
            # but do not run them
            all_miso_cmds.append(miso_cmd)
        else:
            print(" - Executing: %s" % (miso_cmd))
            exit_status = os.system(miso_cmd)
            if exit_status != 0:
                # Keep going with the remaining genes, but do not hide
                # the failure.
                print(" - WARNING: command returned non-zero status %d"
                      % (exit_status))

    # The returned settings object is not used here; the call is kept in
    # case loading has side effects elsewhere -- TODO confirm.
    Settings.load(settings)

    if not use_cluster:
        return

    # Threshold for putting jobs in the long queue
    long_thresh = 50
    # Delay between jobs
    delay_constant = 0.9

    # Resolve the default *before* formatting chunk_jobs with %d, so that
    # chunk_jobs=None falls back to the default instead of raising.
    if not chunk_jobs:
        print(" - Using default chunk jobs = %d" % (200))
        chunk_jobs = 200

    # Invoke the commands using the cluster
    print("Sending %d genes to be run on cluster in chunks of %d..."
          % (num_genes, chunk_jobs))

    # Number of batches needed to get roughly 'chunk_jobs' commands each
    num_chunks = max(1, int(round(num_genes / float(chunk_jobs))))

    # Split the gene records into batches
    cmd_batches = cluster_utils.chunk_list(all_miso_cmds, num_chunks)

    # One timestamp shared by all batch log names of this invocation
    time_str = time.strftime("%m-%d-%y_%H:%M:%S")

    batch_logs_dir = os.path.join(output_dir, "batch-logs")
    if not os.path.isdir(batch_logs_dir):
        os.makedirs(batch_logs_dir)

    for batch_num, batch in enumerate(cmd_batches):
        batch_size = len(batch)
        print("Running batch %d (batch size = %d)" % (batch_num, batch_size))

        if batch_size >= long_thresh:
            queue_type = "long"
        else:
            queue_type = "short"

        # Pool all the MISO commands belonging to this batch.  Every
        # command -- including the last one -- appends its output to the
        # batch log file.
        batch_logfile = os.path.join(batch_logs_dir,
                                     "batch-%d-%s.log" % (batch_num, time_str))
        cmd_to_run = ";\n".join(["%s >> %s" % (cmd, batch_logfile)
                                 for cmd in batch])

        # Run on cluster
        job_name = "gene_psi_batch_%d" % (batch_num)
        cluster_utils.run_on_cluster(cmd_to_run, job_name, output_dir,
                                     queue_type=queue_type,
                                     settings=settings)
        time.sleep(delay_constant)
def compute_all_genes_psi(gff_dir, bam_filename, read_len, output_dir,
                          use_cluster=False, chunk_jobs=200,
                          overhang_len=1, paired_end=None,
                          settings=None):
    """
    Compute Psi values for genes using a GFF and a BAM filename.

    One 'run_miso.py --compute-gene-psi' command is built for every gene
    returned by gff_utils.get_gene_ids_to_gff_index(gff_dir).  The commands
    are either executed immediately with os.system, or, when use_cluster is
    set, grouped into batches and submitted through
    cluster_utils.run_on_cluster.

    - gff_dir: GFF index location, passed to
      gff_utils.get_gene_ids_to_gff_index.
    - bam_filename: BAM filename passed to every command.
    - read_len: read length (integer), passed as --read-len.
    - output_dir: output directory; created if it does not exist.
    - use_cluster: if true, submit batched commands to the cluster instead
      of running them locally.
    - chunk_jobs: approximate number of commands per cluster batch.  A
      falsy value (None, 0, False) falls back to 200.
    - overhang_len: passed as --overhang-len when paired_end is None.
    - paired_end: None for single-end mode, otherwise a pair whose first
      two items are passed as --paired-end <mean> <sd>.
    - settings: optional settings filename, passed as --settings-filename
      and forwarded to Settings.load and cluster_utils.run_on_cluster.
    """
    gene_ids_to_gff_index = gff_utils.get_gene_ids_to_gff_index(gff_dir)
    num_genes = len(gene_ids_to_gff_index)
    miso_run = os.path.join(miso_path, "run_miso.py")

    print("Computing gene-level Psi for %d genes..." % (num_genes))
    print(" - GFF index: %s" % (gff_dir))
    print(" - BAM: %s" % (bam_filename))
    print(" - Read length: %d" % (read_len))
    print(" - Output directory: %s" % (output_dir))

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Options shared by every per-gene command
    if paired_end is not None:
        # Run in paired-end mode
        frag_mean = float(paired_end[0])
        frag_sd = float(paired_end[1])
        mode_opts = " --paired-end %.1f %.1f" % (frag_mean, frag_sd)
    else:
        mode_opts = " --overhang-len %d" % (overhang_len)
    # Add settings filename if given
    if settings is not None:
        mode_opts += " --settings-filename %s" % (settings)

    # All commands to run (only filled in cluster mode)
    all_miso_cmds = []
    for gene_id, gff_index_filename in gene_ids_to_gff_index.items():
        miso_cmd = "python %s --compute-gene-psi \"%s\" \"%s\" %s %s --read-len %d " \
                   % (miso_run, gene_id, gff_index_filename, bam_filename,
                      output_dir, read_len)
        miso_cmd += mode_opts

        if use_cluster:
            # If asked to use cluster, accumulate the MISO commands
            # but do not run them
            all_miso_cmds.append(miso_cmd)
        else:
            print(" - Executing: %s" % (miso_cmd))
            exit_status = os.system(miso_cmd)
            if exit_status != 0:
                # Keep going with the remaining genes, but do not hide
                # the failure.
                print(" - WARNING: command returned non-zero status %d"
                      % (exit_status))

    # The returned settings object is not used here; the call is kept in
    # case loading has side effects elsewhere -- TODO confirm.
    Settings.load(settings)

    if not use_cluster:
        return

    # Threshold for putting jobs in the long queue
    long_thresh = 50
    # Delay between jobs
    delay_constant = 0.9

    # Resolve the default *before* formatting chunk_jobs with %d, so that
    # chunk_jobs=None falls back to the default instead of raising.
    if not chunk_jobs:
        print(" - Using default chunk jobs = %d" % (200))
        chunk_jobs = 200

    # Invoke the commands using the cluster
    print("Sending %d genes to be run on cluster in chunks of %d..."
          % (num_genes, chunk_jobs))

    # Number of batches needed to get roughly 'chunk_jobs' commands each
    num_chunks = max(1, int(round(num_genes / float(chunk_jobs))))

    # Split the gene records into batches
    cmd_batches = cluster_utils.chunk_list(all_miso_cmds, num_chunks)

    # One timestamp shared by all batch log names of this invocation
    time_str = time.strftime("%m-%d-%y_%H:%M:%S")

    batch_logs_dir = os.path.join(output_dir, "batch-logs")
    if not os.path.isdir(batch_logs_dir):
        os.makedirs(batch_logs_dir)

    for batch_num, batch in enumerate(cmd_batches):
        batch_size = len(batch)
        print("Running batch %d (batch size = %d)" % (batch_num, batch_size))

        if batch_size >= long_thresh:
            queue_type = "long"
        else:
            queue_type = "short"

        # Pool all the MISO commands belonging to this batch.  Every
        # command -- including the last one -- appends its output to the
        # batch log file.
        batch_logfile = os.path.join(batch_logs_dir,
                                     "batch-%d-%s.log" % (batch_num, time_str))
        cmd_to_run = ";\n".join(["%s >> %s" % (cmd, batch_logfile)
                                 for cmd in batch])

        # Run on cluster
        job_name = "gene_psi_batch_%d" % (batch_num)
        cluster_utils.run_on_cluster(cmd_to_run, job_name, output_dir,
                                     queue_type=queue_type,
                                     settings=settings)
        time.sleep(delay_constant)
def run_two_iso_on_cluster(miso_path, events_filename, event_type, psi_outdir,
                           read_len, overhang_len, chunk_jobs=False):
    """
    Run two-isoform MISO on cluster.

    - miso_path: directory containing run_miso.py.
    - events_filename: file the two-isoform events are loaded from; also
      passed to every generated command.
    - event_type: passed to as_events.MISOEvents and as --event-type.
    - psi_outdir: output directory passed to every command and to
      cluster_utils.run_on_cluster.
    - read_len, overhang_len: integers passed as --read-len and
      --overhang-len.
    - chunk_jobs: the number of events in each batch.  The commands of a
      batch are joined with '; ' and submitted as one cluster job.  When
      chunk_jobs is not specified (falsy), all events are submitted
      together as a single job.  When given it must be greater than 1.
    """
    print("Running two isoform MISO on cluster...")
    # Load two-isoform events
    miso_events = as_events.MISOEvents(2, event_type, from_file=events_filename)
    num_total_events = len(miso_events.events)
    delay_constant = 0.9

    if not chunk_jobs:
        # No chunking: one batch holding every event name
        event_batches = [list(miso_events.events)]
    else:
        assert chunk_jobs > 1, \
            "chunk_jobs must be greater than 1, got %r" % (chunk_jobs,)
        # Compute number of chunks we'd need to split all events to in order
        # to get 'chunk_jobs'-many events in a job.  Never fewer than one
        # chunk, even when chunk_jobs exceeds the number of events.
        num_chunks = max(1, int(round(num_total_events / float(chunk_jobs))))
        print("Splitting events into %d chunks..." % (num_chunks))
        event_names = list(miso_events.events.keys())
        event_batches = cluster_utils.chunk_list(event_names, num_chunks)
        print(" - Total of %d event batches." % (len(event_batches)))

    batch_lens = [len(batch) for batch in event_batches]
    max_events_per_batch = max(batch_lens) if batch_lens else 0
    queue_thresh = 50
    num_batches = len(event_batches)
    long_batch = 100

    # Only batches larger than 'long_batch' need the long queue; anything
    # smaller (including batches below 'queue_thresh') goes to short.
    if max_events_per_batch > long_batch:
        print("Longest batch contains more than %d jobs -- changing queue type to long"
              % (long_batch))
        queue_type = 'long'
    elif max_events_per_batch >= queue_thresh:
        print("Longest batch contains more than %d jobs -- changing queue type to short"
              % (queue_thresh))
        queue_type = 'short'
    else:
        print("Longest batch contains fewer than %d jobs -- using queue type short"
              % (queue_thresh))
        queue_type = 'short'

    run_miso = os.path.join(miso_path, 'run_miso.py')
    for event_batch in event_batches:
        batch_event_names = list(event_batch)
        print("Processing a batch of size %d events" % (len(batch_event_names)))
        if not batch_event_names:
            # Nothing to submit for an empty batch
            continue

        # Compile set of commands that will be run in the same job
        miso_event_cmd_list = [
            '%s %s --run-two-iso-event \"%s\" %s %s --event-type %s --read-len %d --overhang-len %d'
            % (python_exe, run_miso, event_name, events_filename, psi_outdir,
               event_type, read_len, overhang_len)
            for event_name in batch_event_names]

        # Execute events in batch
        miso_event_batch_cmd = "; ".join(miso_event_cmd_list)

        # The job is named after the last event of the batch
        job_name = batch_event_names[-1]
        if num_batches > 1:
            job_name += "_batch"
        cluster_utils.run_on_cluster(miso_event_batch_cmd, job_name, psi_outdir,
                                     queue_type=queue_type)
        # Add pause to allow cluster to process jobs
        time.sleep(delay_constant)

    # Parse all events into genes.  The result is not used here; the call
    # is kept in case it has side effects -- TODO confirm.
    miso_events.loaded_events_to_genes(read_len=read_len,
                                       overhang_len=overhang_len)
def run_two_iso_on_cluster(miso_path, events_filename, event_type, psi_outdir,
                           read_len, overhang_len, chunk_jobs=False):
    """
    Run two-isoform MISO on cluster.

    - miso_path: directory containing run_miso.py.
    - events_filename: file the two-isoform events are loaded from; also
      passed to every generated command.
    - event_type: passed to as_events.MISOEvents and as --event-type.
    - psi_outdir: output directory passed to every command and to
      cluster_utils.run_on_cluster.
    - read_len, overhang_len: integers passed as --read-len and
      --overhang-len.
    - chunk_jobs: the number of events in each batch.  The commands of a
      batch are joined with '; ' and submitted as one cluster job.  When
      chunk_jobs is not specified (falsy), all events are submitted
      together as a single job.  When given it must be greater than 1.
    """
    print("Running two isoform MISO on cluster...")
    # Load two-isoform events
    miso_events = as_events.MISOEvents(2, event_type, from_file=events_filename)
    num_total_events = len(miso_events.events)
    delay_constant = 0.9

    if not chunk_jobs:
        # No chunking: one batch holding every event name
        event_batches = [list(miso_events.events)]
    else:
        assert chunk_jobs > 1, \
            "chunk_jobs must be greater than 1, got %r" % (chunk_jobs,)
        # Compute number of chunks we'd need to split all events to in order
        # to get 'chunk_jobs'-many events in a job.  Never fewer than one
        # chunk, even when chunk_jobs exceeds the number of events.
        num_chunks = max(1, int(round(num_total_events / float(chunk_jobs))))
        print("Splitting events into %d chunks..." % (num_chunks))
        event_names = list(miso_events.events.keys())
        event_batches = cluster_utils.chunk_list(event_names, num_chunks)
        print(" - Total of %d event batches." % (len(event_batches)))

    batch_lens = [len(batch) for batch in event_batches]
    max_events_per_batch = max(batch_lens) if batch_lens else 0
    queue_thresh = 50
    num_batches = len(event_batches)
    long_batch = 100

    # Only batches larger than 'long_batch' need the long queue; anything
    # smaller (including batches below 'queue_thresh') goes to short.
    if max_events_per_batch > long_batch:
        print("Longest batch contains more than %d jobs -- changing queue type to long"
              % (long_batch))
        queue_type = 'long'
    elif max_events_per_batch >= queue_thresh:
        print("Longest batch contains more than %d jobs -- changing queue type to short"
              % (queue_thresh))
        queue_type = 'short'
    else:
        print("Longest batch contains fewer than %d jobs -- using queue type short"
              % (queue_thresh))
        queue_type = 'short'

    run_miso = os.path.join(miso_path, 'run_miso.py')
    for event_batch in event_batches:
        batch_event_names = list(event_batch)
        print("Processing a batch of size %d events" % (len(batch_event_names)))
        if not batch_event_names:
            # Nothing to submit for an empty batch
            continue

        # Compile set of commands that will be run in the same job
        miso_event_cmd_list = [
            'python %s --run-two-iso-event \"%s\" %s %s --event-type %s --read-len %d --overhang-len %d'
            % (run_miso, event_name, events_filename, psi_outdir,
               event_type, read_len, overhang_len)
            for event_name in batch_event_names]

        # Execute events in batch
        miso_event_batch_cmd = "; ".join(miso_event_cmd_list)

        # The job is named after the last event of the batch
        job_name = batch_event_names[-1]
        if num_batches > 1:
            job_name += "_batch"
        cluster_utils.run_on_cluster(miso_event_batch_cmd, job_name, psi_outdir,
                                     queue_type=queue_type)
        # Add pause to allow cluster to process jobs
        time.sleep(delay_constant)

    # Parse all events into genes.  The result is not used here; the call
    # is kept in case it has side effects -- TODO confirm.
    miso_events.loaded_events_to_genes(read_len=read_len,
                                       overhang_len=overhang_len)