def __init__(self, gff_dir, bam_filename, output_dir, read_len,
             overhang_len, main_logger,
             settings_fname=None,
             paired_end=None,
             use_cluster=False,
             chunk_jobs=200,
             SGEarray=False,
             sge_job_name="misojob",
             gene_ids=None,
             num_proc=None,
             wait_on_jobs=True):
    self.main_logger = main_logger
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename
    # Check that the BAM file exists and that it has an index
    if not os.path.isfile(self.bam_filename):
        self.main_logger.error("BAM file %s not found." %(self.bam_filename))
        sys.exit(1)
    self.bam_index_fname = "%s.bai" %(self.bam_filename)
    if not os.path.isfile(self.bam_index_fname):
        self.main_logger.warning("Expected BAM index file %s not found." \
                                 %(self.bam_index_fname))
        self.main_logger.warning("Are you sure your BAM file is indexed?")
    self.output_dir = output_dir
    self.read_len = read_len
    # For now, always set the overhang to 1
    #self.overhang_len = overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    self.chunk_jobs = chunk_jobs
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    # If chunk_jobs was not given (i.e. evaluates to False),
    # fall back to the default of 200
    if not self.chunk_jobs:
        self.chunk_jobs = 200
    self.SGEarray = SGEarray
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        num_proc = int(num_proc)
        self.num_processors = num_proc
        self.main_logger.info("Using %d processors" %(num_proc))
    self.long_thresh = 50
    self.batch_logs_dir = \
        os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = \
        os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = \
        os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = \
        os.path.join(output_dir, "scripts_output")
    misc_utils.make_dir(self.batch_logs_dir)
    misc_utils.make_dir(self.batch_genes_dir)
    misc_utils.make_dir(self.cluster_scripts_dir)
    misc_utils.make_dir(self.scripts_output_dir)
    # First compile the set of genes to run on and output
    # them to file along with their indexed filenames
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    # If we're given filtered gene IDs, use them
    if gene_ids is not None:
        self.gene_ids = gene_ids
    else:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    if len(self.gene_ids) == 0:
        self.main_logger.error("No genes to run on. Did you pass me the wrong path " \
                               "to your indexed GFF directory? " \
                               "Or perhaps your indexed GFF directory " \
                               "is empty?")
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
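# Note: the constructor above only warns when the BAM index (.bai) is
# missing; it does not create one. A minimal sketch of how a caller could
# index the BAM before constructing this object, assuming pysam is
# installed (pysam.index() wraps "samtools index"; the filename below is
# a placeholder):
#
#   import pysam
#   pysam.index("sample.sorted.bam")   # writes sample.sorted.bam.bai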
def run(self, delay_constant=0.9):
    """
    Run batches either locally on multiple cores or on a cluster.
    """
    batch_filenames = self.output_batch_files()
    # All MISO commands, each corresponding to a batch,
    # along with the number of jobs in each batch
    all_miso_cmds = []
    num_batches = len(batch_filenames)
    ##
    ## Prepare all the files necessary to run each batch
    ##
    print "Preparing to run %d batches of jobs..." %(num_batches)
    miso_run = os.path.join(miso_path, "run_miso.py")
    for batch_num, batch in enumerate(batch_filenames):
        batch_filename, batch_size = batch
        miso_cmd = \
            "python %s --compute-genes-from-file \"%s\" %s %s --read-len %d " \
            %(miso_run,
              batch_filename,
              self.bam_filename,
              self.output_dir,
              self.read_len)
        # Add paired-end parameters and read len/overhang len
        if self.paired_end is not None:
            # Run in paired-end mode
            frag_mean = float(self.paired_end[0])
            frag_sd = float(self.paired_end[1])
            miso_cmd += " --paired-end %.1f %.1f" %(frag_mean, frag_sd)
        else:
            # Overhang len is only used in single-end mode
            miso_cmd += " --overhang-len %d" %(self.overhang_len)
        # Add settings filename if given
        if self.settings_fname is not None:
            miso_cmd += " --settings-filename %s" \
                %(self.settings_fname)
        all_miso_cmds.append((miso_cmd, batch_size))
    ##
    ## Run all MISO commands for the batches,
    ## either locally using multiple cores or on a cluster
    ##
    # First handle the special case of SGE array submission
    if self.use_cluster and self.SGEarray:
        print "Using SGEarray..."
        # Call SGE
        batch_argfile = os.path.join(self.cluster_scripts_dir,
                                     "run_args.txt")
        cluster_utils.run_SGEarray_cluster(all_miso_cmds,
                                           batch_argfile,
                                           self.output_dir,
                                           settings=self.settings_fname,
                                           job_name=self.sge_job_name,
                                           chunk=self.chunk_jobs)
        # End SGE case
        return
    # All cluster jobs
    cluster_jobs = []
    for batch_num, cmd_info in enumerate(all_miso_cmds):
        miso_cmd, batch_size = cmd_info
        print "Running batch of %d genes..." %(batch_size)
        print "  - Executing: %s" %(miso_cmd)
        # Make a log file for the batch, where all the output
        # will be redirected
        time_str = time.strftime("%m-%d-%y_%H:%M:%S")
        batch_logfile = os.path.join(self.batch_logs_dir,
                                     "batch-%d-%s.log" %(batch_num, time_str))
        cmd_to_run = "%s >> \"%s\";" %(miso_cmd, batch_logfile)
        if not self.use_cluster:
            # Run locally
            p = subprocess.Popen(cmd_to_run, shell=True)
            thread_id = "batch-%d" %(batch_num)
            print "  - Submitted thread %s" %(thread_id)
            self.threads[thread_id] = p
        else:
            # Set up the cluster engine
            Settings.load(self.settings_fname)
            clustercmd = Settings.get_cluster_command()
            self.cluster_engine = getClusterEngine(clustercmd,
                                                   self.settings_fname)
            # Use the long queue for large batches, the short
            # queue otherwise
            if batch_size >= self.long_thresh:
                queue_type = "long"
            else:
                queue_type = "short"
            # Run on cluster
            job_name = "gene_psi_batch_%d" %(batch_num)
            print "Submitting to cluster: %s" %(cmd_to_run)
            job_id = \
                self.cluster_engine.run_on_cluster(cmd_to_run,
                                                   job_name,
                                                   self.output_dir,
                                                   queue_type=queue_type)
            if job_id is not None:
                cluster_jobs.append(job_id)
            time.sleep(delay_constant)
    # Extra delay constant
    time.sleep(delay_constant)
    # If we ran jobs on the cluster, wait for them if there are
    # any to wait on.
    if self.wait_on_jobs:
        if self.use_cluster and (len(cluster_jobs) == 0):
            # If we're asked to use the cluster but the list
            # of cluster jobs is empty, it means we could not
            # parse the job IDs from the submission system.
            # Report this to the user.
            self.main_logger.warning("Asked to wait on cluster jobs but cannot " \
                                     "parse their job IDs from the cluster " \
                                     "submission system.")
        if self.use_cluster:
            # Try to wait on jobs no matter what; though if
            # 'cluster_jobs' is empty here, it will not wait.
            # (Guarded by use_cluster, since self.cluster_engine
            # is only created when submitting to a cluster.)
            self.cluster_engine.wait_on_jobs(cluster_jobs,
                                             self.cluster_cmd)
    else:
        if self.use_cluster:
            # If we're running in cluster mode and asked not
            # to wait for jobs, let the user know
            self.main_logger.info("Not waiting on cluster jobs.")
    # If we ran jobs locally, wait for them to finish
    # (this does nothing if we submitted jobs to the cluster)
    self.wait_on_threads()
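# A minimal usage sketch of the two methods above (hypothetical: the
# enclosing class is assumed to be named GenesDispatcher as in MISO's
# miso.py, "my_logger" is any logging.Logger, and the paths/read length
# are placeholders to adjust for your data):
#
#   import logging
#   logging.basicConfig()
#   my_logger = logging.getLogger("miso")
#   dispatcher = GenesDispatcher("indexed_gff/", "sample.sorted.bam",
#                                "miso_output/", 36, 1, my_logger,
#                                use_cluster=False, num_proc=4)
#   dispatcher.run()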