Example #1
 def __init__(self, gff_dir, bam_filename,
              output_dir, read_len, overhang_len,
              settings_fname=None,
              paired_end=None,
              use_cluster=False,
              chunk_jobs=200,
              SGEarray=False,
              sge_job_name="misojob",
              gene_ids=None,
              num_proc=None,
              wait_on_jobs=True):
     self.threads = {}
     self.gff_dir = gff_dir
     self.bam_filename = bam_filename
     # Check that the BAM filename exists and that it has an index
     if not os.path.isfile(self.bam_filename):
         print "Error: BAM file %s not found." %(self.bam_filename)
         sys.exit(1)
     self.bam_index_fname = "%s.bai" %(self.bam_filename)
     if not os.path.isfile(self.bam_index_fname):
         print "WARNING: Expected BAM index file %s not found." \
             %(self.bam_index_fname)
         print "Are you sure your BAM file is indexed?"
     self.output_dir = output_dir
     self.read_len = read_len
     # For now setting overhang to 1 always
     #self.overhang_len = overhang_len
     self.overhang_len = 1
     self.settings_fname = settings_fname
     self.paired_end = paired_end
     self.use_cluster = use_cluster
     self.chunk_jobs = chunk_jobs
     self.settings = Settings.get()
     self.cluster_cmd = Settings.get_cluster_command()
     self.sge_job_name = sge_job_name
     self.wait_on_jobs = wait_on_jobs
     # If chunk_jobs was not given (i.e. is set to False),
     # fall back to an arbitrary default
     if not self.chunk_jobs:
         self.chunk_jobs = 200
     self.SGEarray = SGEarray
     self.num_processors = Settings.get_num_processors()
     if num_proc is not None:
         num_proc = int(num_proc)
         self.num_processors = num_proc
         print "Using %d processors" %(num_proc)
     self.long_thresh = 50
     self.batch_logs_dir = \
         os.path.join(output_dir, "batch-logs")
     self.batch_genes_dir = \
         os.path.join(output_dir, "batch-genes")
     self.cluster_scripts_dir = \
         os.path.join(output_dir, "cluster_scripts")
     self.scripts_output_dir = \
         os.path.join(output_dir, "scripts_output")
     misc_utils.make_dir(self.batch_logs_dir)
     misc_utils.make_dir(self.batch_genes_dir)
     misc_utils.make_dir(self.cluster_scripts_dir)
     misc_utils.make_dir(self.scripts_output_dir)
     # First compile a set of genes that should be run on
     # and output them to file along with their indexed
     # filenames
     self.gene_ids_to_gff_index = \
         gff_utils.get_gene_ids_to_gff_index(gff_dir)
     # If we're given filtered gene IDs, use them
     if gene_ids is not None:
         self.gene_ids = gene_ids
     else:
         self.gene_ids = self.gene_ids_to_gff_index.keys()
     if len(self.gene_ids) == 0:
         print "Error: No genes to run on. Did you pass me the wrong path " \
               "to your index GFF directory? " \
               "Or perhaps your indexed GFF directory " \
               "is empty?"
         sys.exit(1)
     self.batch_filenames = self.output_batch_files()
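
This constructor is the setup phase of a MISO batch dispatcher: it checks that the BAM file and its .bai index exist, pins the overhang length to 1, creates the batch-logs/batch-genes/cluster-scripts output directories, and compiles the list of gene IDs from the indexed GFF directory. A minimal usage sketch under stated assumptions follows: the example omits the enclosing class statement, so the class name GenesDispatcher and all paths below are illustrative placeholders.

 # Hypothetical invocation; `GenesDispatcher` is an assumed class name
 # (the example omits the class statement) and the paths are placeholders.
 dispatcher = GenesDispatcher("indexed_gff/",       # indexed GFF directory
                              "sample.sorted.bam",  # sorted, indexed BAM
                              "miso_output/",       # output directory
                              36,                   # read length
                              1,                    # overhang (forced to 1)
                              num_proc=4)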
Example #2
 def __init__(self, gff_dir, bam_filename,
              output_dir, read_len, overhang_len,
              main_logger,
              settings_fname=None,
              paired_end=None,
              use_cluster=False,
              chunk_jobs=200,
              SGEarray=False,
              sge_job_name="misojob",
              gene_ids=None,
              num_proc=None,
              wait_on_jobs=True):
     self.main_logger = main_logger
     self.threads = {}
     self.gff_dir = gff_dir
     self.bam_filename = bam_filename
     # Check that the BAM filename exists and that it has an index
     if not os.path.isfile(self.bam_filename):
         self.main_logger.error("BAM file %s not found." %(self.bam_filename))
         sys.exit(1)
     self.bam_index_fname = "%s.bai" %(self.bam_filename)
     if not os.path.isfile(self.bam_index_fname):
         self.main_logger.warning("Expected BAM index file %s not found." \
                             %(self.bam_index_fname))
         self.main_logger.warning("Are you sure your BAM file is indexed?")
     self.output_dir = output_dir
     self.read_len = read_len
     # For now setting overhang to 1 always
     #self.overhang_len = overhang_len
     self.overhang_len = 1
     self.settings_fname = settings_fname
     self.paired_end = paired_end
     self.use_cluster = use_cluster
     self.chunk_jobs = chunk_jobs
     self.settings = Settings.get()
     self.cluster_cmd = Settings.get_cluster_command()
     self.sge_job_name = sge_job_name
     self.wait_on_jobs = wait_on_jobs
     # If chunk_jobs was not given (i.e. is set to False),
     # fall back to an arbitrary default
     if not self.chunk_jobs:
         self.chunk_jobs = 200
     self.SGEarray = SGEarray
     self.num_processors = Settings.get_num_processors()
     if num_proc is not None:
         num_proc = int(num_proc)
         self.num_processors = num_proc
         self.main_logger.info("Using %d processors" %(num_proc))
     self.long_thresh = 50
     self.batch_logs_dir = \
         os.path.join(output_dir, "batch-logs")
     self.batch_genes_dir = \
         os.path.join(output_dir, "batch-genes")
     self.cluster_scripts_dir = \
         os.path.join(output_dir, "cluster_scripts")
     self.scripts_output_dir = \
         os.path.join(output_dir, "scripts_output")
     misc_utils.make_dir(self.batch_logs_dir)
     misc_utils.make_dir(self.batch_genes_dir)
     misc_utils.make_dir(self.cluster_scripts_dir)
     misc_utils.make_dir(self.scripts_output_dir)
     # First compile a set of genes that should be run on
     # and output them to file along with their indexed
     # filenames
     self.gene_ids_to_gff_index = \
         gff_utils.get_gene_ids_to_gff_index(gff_dir)
     # If we're given filtered gene IDs, use them
     if gene_ids is not None:
         self.gene_ids = gene_ids
     else:
         self.gene_ids = self.gene_ids_to_gff_index.keys()
     if len(self.gene_ids) == 0:
         self.main_logger.error("No genes to run on. Did you pass me the wrong path " \
                                "to your index GFF directory? " \
                                "Or perhaps your indexed GFF directory " \
                                "is empty?")
         sys.exit(1)
     self.batch_filenames = self.output_batch_files()
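
Example #2 is identical to Example #1 except that it takes a main_logger and routes errors and warnings through it instead of print. Any object exposing error, warning, and info methods satisfies the calls made here, so a standard logging.Logger fits. A hedged sketch, again assuming the class name GenesDispatcher:

 import logging

 logging.basicConfig(level=logging.INFO)
 main_logger = logging.getLogger("miso")

 # Hypothetical invocation; main_logger is the only new argument
 # relative to Example #1.
 dispatcher = GenesDispatcher("indexed_gff/",
                              "sample.sorted.bam",
                              "miso_output/",
                              36,            # read length
                              1,             # overhang length
                              main_logger,
                              num_proc=4)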
Example #3
    def run(self, delay_constant=0.9):
        """
        Run batches either locally on multi-cores
        or using cluster.
        """
        batch_filenames = self.output_batch_files()
        # All MISO commands, each corresponding to a batch,
        # along with the number of jobs in each batch
        all_miso_cmds = []
        num_batches = len(batch_filenames)
        ##
        ## Prepare all the files necessary to run each batch
        ##
        print "Preparing to run %d batches of jobs..." % (num_batches)
        miso_run = os.path.join(miso_path, "run_miso.py")
        for batch_num, batch in enumerate(batch_filenames):
            batch_filename, batch_size = batch
            miso_cmd = \
              "python %s --compute-genes-from-file \"%s\" %s %s --read-len %d " \
                    %(miso_run,
                      batch_filename,
                      self.bam_filename,
                      self.output_dir,
                      self.read_len)
            # Add paired-end parameters and read len/overhang len
            if self.paired_end is not None:
                # Run in paired-end mode
                frag_mean = float(self.paired_end[0])
                frag_sd = float(self.paired_end[1])
                miso_cmd += " --paired-end %.1f %.1f" % (frag_mean, frag_sd)
            else:
                # Overhang len only used in single-end mode
                miso_cmd += " --overhang-len %d" % (self.overhang_len)
            # Add settings filename if given
            if self.settings_fname is not None:
                miso_cmd += " --settings-filename %s" \
                    %(self.settings_fname)
            all_miso_cmds.append((miso_cmd, batch_size))
        ##
        ## Run all MISO commands for the batches
        ## either locally using multi-cores or on cluster
        ##
        # First handle special case of SGE cluster submission
        if self.use_cluster and self.SGEarray:
            print "Using SGEarray..."
            # Call SGE
            batch_argfile = os.path.join(self.cluster_scripts_dir,
                                         "run_args.txt")
            cluster_utils.run_SGEarray_cluster(all_miso_cmds,
                                               batch_argfile,
                                               self.output_dir,
                                               settings=self.settings_fname,
                                               job_name=self.sge_job_name,
                                               chunk=self.chunk_jobs)
            # End SGE case
            return

        # All cluster jobs
        cluster_jobs = []
        for batch_num, cmd_info in enumerate(all_miso_cmds):
            miso_cmd, batch_size = cmd_info
            print "Running batch of %d genes.." % (batch_size)
            print "  - Executing: %s" % (miso_cmd)
            # Make a log file for the batch, where all the output
            # will be redirected
            time_str = time.strftime("%m-%d-%y_%H:%M:%S")
            batch_logfile = os.path.join(
                self.batch_logs_dir, "batch-%d-%s.log" % (batch_num, time_str))
            cmd_to_run = "%s >> \"%s\";" % (miso_cmd, batch_logfile)
            if not self.use_cluster:
                # Run locally
                p = subprocess.Popen(cmd_to_run, shell=True)
                thread_id = "batch-%d" % (batch_num)
                print "  - Submitted thread %s" % (thread_id)
                self.threads[thread_id] = p
            else:
                # Setup cluster engine
                Settings.load(self.settings_fname)
                clustercmd = Settings.get_cluster_command()

                self.cluster_engine = getClusterEngine(clustercmd,
                                                       self.settings_fname)

                # Run on cluster
                if batch_size >= self.long_thresh:
                    queue_type = "long"
                else:
                    queue_type = "short"
                job_name = "gene_psi_batch_%d" % (batch_num)
                print "Submitting to cluster: %s" % (cmd_to_run)
                job_id = \
                    self.cluster_engine.run_on_cluster(cmd_to_run,
                                                       job_name,
                                                       self.output_dir,
                                                       queue_type=queue_type)
                if job_id is not None:
                    cluster_jobs.append(job_id)
                time.sleep(delay_constant)
            # Extra delay constant
            time.sleep(delay_constant)
        # If we ran jobs on the cluster, wait for them if asked to
        if self.use_cluster and self.wait_on_jobs:
            if len(cluster_jobs) == 0:
                # We were asked to wait on cluster jobs, but the
                # list of job IDs is empty: we could not parse them
                # from the submission system's output. Report this
                # to the user.
                self.main_logger.warning("Asked to wait on cluster jobs but cannot " \
                                         "parse their job IDs from the cluster submission " \
                                         "system.")
            # Try to wait on jobs regardless; if 'cluster_jobs' is
            # empty here, this will not wait
            self.cluster_engine.wait_on_jobs(cluster_jobs, self.cluster_cmd)
        elif self.use_cluster:
            # We're running in cluster mode but were asked not
            # to wait for jobs; let the user know
            self.main_logger.info("Not waiting on cluster jobs.")
        # If we ran jobs locally, wait for them to finish
        # (this is a no-op if jobs were submitted to the
        # cluster)
        self.wait_on_threads()
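
run() builds one run_miso.py command per batch and executes them either locally via subprocess.Popen or through a cluster engine, ending with a call to wait_on_threads(), which these examples do not show. Since self.threads maps batch IDs to Popen handles, here is a minimal sketch of what such a helper could look like; this is an assumption, not necessarily the package's actual implementation:

    def wait_on_threads(self):
        # Block until every locally launched batch exits; a no-op when
        # batches went to the cluster, since self.threads stays empty
        # in that case.
        for thread_id, proc in self.threads.items():
            print "Waiting on thread %s" % (thread_id)
            exit_code = proc.wait()
            if exit_code != 0:
                print "WARNING: thread %s exited with code %d" \
                    % (thread_id, exit_code)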