Example #1
def get_main_logger(log_outdir,
                    level=logging.WARNING,
                    include_stdout=True):
    """
    Return logger object for main MISO thread.
    """
    logger_name = "miso_main"
    misc_utils.make_dir(log_outdir)
    logger = logging.getLogger(logger_name)
    formatter = \
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                          datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.root.setLevel(level)
    # Optionally add handler that streams all logs
    # to stdout
    if include_stdout:
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(level)
        ch.setFormatter(formatter)
        logger.addHandler(ch)
    # Write to main logger filename along
    # with time stamp
    logger_basename = "main.%s.log" %(misc_utils.get_timestamp())
    logger_fname = os.path.join(log_outdir, logger_basename)
    fh = logging.FileHandler(logger_fname)
    fh.setLevel(level)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger
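
A minimal usage sketch for this helper (hedged: the "my_logs" directory is made up, and misc_utils is assumed to be MISO's utility module providing make_dir and get_timestamp):

import logging

# Hypothetical call; "my_logs" is an assumed output directory.
logger = get_main_logger("my_logs", level=logging.INFO)
# The message goes both to stdout and to my_logs/main.<timestamp>.log
logger.info("MISO main thread started")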
Example #2
def main():
    from optparse import OptionParser
    parser = OptionParser()

    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
                      help="Compute comparison statistics between the two " \
                      "given samples. Expects three directories: the first is " \
                      "sample1's MISO output, the second is sample2's MISO " \
                      "output, and the third is the directory where " \
                      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels",
                      dest="comparison_labels",
                      nargs=2,
                      default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--use-compressed",
                      dest="use_compressed",
                      nargs=1,
                      default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    (options, args) = parser.parse_args()

    if options.samples_to_compare is None:
        greeting()

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."

    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" % (output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
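
Driven programmatically, the --compare-samples branch amounts to the following sketch (directory names and labels are hypothetical; ht and misc_utils are assumed to be the imported MISO modules used above):

import os

# Hypothetical equivalent of:
#   --compare-samples sample1/ sample2/ comparisons/ --comparison-labels heart brain
sample1 = os.path.abspath("sample1_miso_output")
sample2 = os.path.abspath("sample2_miso_output")
out_dir = os.path.abspath("comparisons")
if not os.path.isdir(out_dir):
    misc_utils.make_dir(out_dir)
ht.output_samples_comparison(sample1, sample2, out_dir,
                             sample_labels=["heart", "brain"],
                             use_compressed=None)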
Example #3
def clear_output_dir():
    output_dir = OUTPUT_DIR
    # Clear out the previous test output directory
    print "Clearing previous output directory..."
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir, ignore_errors=True)
    # Make new output directory
    misc_utils.make_dir(output_dir)
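
The rmtree-then-make_dir pair is a directory reset idiom; a standalone sketch of the same pattern (the path argument is arbitrary, and os.makedirs is used here instead of the MISO helper):

import os
import shutil

def reset_dir(path):
    # Remove the directory tree if present, then recreate it empty.
    if os.path.isdir(path):
        shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path)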
Example #4
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
		      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    (options, args) = parser.parse_args()

    greeting()

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label != None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        assert(len(samples_label) >= 1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            misc_utils.make_dir(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        samples_utils.summarize_sampler_results(samples_dir,
                                                summary_filename,
                                                use_compressed=use_compressed)
Example #5
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
                      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    (options, args) = parser.parse_args()

    greeting()

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label != None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        assert(len(samples_label) >= 1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            misc_utils.make_dir(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        samples_utils.summarize_sampler_results(samples_dir,
                                                summary_filename,
                                                use_compressed=use_compressed)
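
A hedged sketch of the --summarize-samples branch called directly (paths and the label are placeholders; samples_utils and misc_utils are the modules imported by the surrounding script):

import os

# Hypothetical equivalent of:
#   --summarize-samples miso_output/ summaries/ --summary-label heart
samples_dir = os.path.abspath("miso_output")
summary_dir = os.path.join(os.path.abspath("summaries"), "summary")
if not os.path.isdir(summary_dir):
    misc_utils.make_dir(summary_dir)
summary_filename = os.path.join(summary_dir, "heart.miso_summary")
samples_utils.summarize_sampler_results(samples_dir, summary_filename,
                                        use_compressed=None)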
Example #6
def main():
    from optparse import OptionParser
    parser = OptionParser()
    
    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
                      help="Compute comparison statistics between the two " \
                      "given samples. Expects three directories: the first is " \
                      "sample1's MISO output, the second is sample2's MISO " \
                      "output, and the third is the directory where " \
                      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    (options, args) = parser.parse_args()

    if options.samples_to_compare is None:
        greeting()

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."
            
    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
Example #7
    def run_on_cluster(self,
                       cmd,
                       job_name,
                       cluster_output_dir,
                       cluster_scripts_dir=None,
                       queue_type=None):
        '''
        Composes job script and launches job
        '''

        misc_utils.make_dir(cluster_output_dir)
        if cluster_scripts_dir == None:
            cluster_scripts_dir = os.path.join(cluster_output_dir,
                                               'cluster_scripts')
            misc_utils.make_dir(cluster_scripts_dir)

        scripts_output_dir = os.path.join(cluster_output_dir, 'scripts_output')
        misc_utils.make_dir(scripts_output_dir)
        scripts_output_dir = os.path.abspath(scripts_output_dir)
        cluster_call = 'sbatch -D \"%s\"' % (scripts_output_dir)

        script_name = os.path.join(cluster_scripts_dir,
                                   '%s_time_%s.sh' \
                                   %(job_name,
                                     time.strftime("%m-%d-%y_%H_%M_%S")))
        self.make_bash_script(script_name, cmd)
        cluster_cmd = cluster_call + ' \"%s\"' % (script_name)
        job_id = self.launch_job(cluster_cmd)
        return job_id
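
A call sketch for this Slurm (sbatch) variant (hedged: engine stands for an instance of the enclosing class, which is assumed to provide make_bash_script and launch_job as the method body implies; the command and paths are placeholders):

# Hypothetical usage; wraps the command in a bash script and submits it.
job_id = engine.run_on_cluster("echo hello",
                               "test_job",
                               "/tmp/cluster_out")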
Example #8
def run_on_cluster(cmd,
                   job_name,
                   cluster_output_dir,
                   cluster_scripts_dir=None,
                   queue_type=None,
                   cmd_name="qsub",
                   settings_fname=None):
    print "Submitting job: %s" % (job_name)
    queue_name = None

    # Load command name from settings file
    if settings_fname != None:
        load_settings(settings_fname)
        cmd_name = Settings.get_cluster_command()

    if queue_type == "long":
        queue_name = Settings.get_long_queue_name()
    elif queue_type == "short":
        queue_name = Settings.get_short_queue_name()
    else:
        print "Warning: Unknown queue type: %s" % (queue_type)
        queue_name = queue_type

    if queue_type is None:
        print "  - queue type: unspecified"
    else:
        print "  - queue type: %s" % (queue_type)
    if queue_name is None:
        print "  - queue name: unspecified"
    else:
        print "  - queue name: %s" % (queue_name)

    misc_utils.make_dir(cluster_output_dir)
    if cluster_scripts_dir == None:
        cluster_scripts_dir = os.path.join(cluster_output_dir,
                                           'cluster_scripts')
        misc_utils.make_dir(cluster_scripts_dir)
    scripts_output_dir = os.path.join(cluster_output_dir, 'scripts_output')
    misc_utils.make_dir(scripts_output_dir)
    scripts_output_dir = os.path.abspath(scripts_output_dir)
    cluster_call = '%s -o \"%s\" -e \"%s\"' % (cmd_name, scripts_output_dir,
                                               scripts_output_dir)
    # Add queue type if given one
    if queue_name != None:
        cluster_call += ' -q \"%s\"' % (queue_name)

    script_name = \
        valid_cluster_name(os.path.join(cluster_scripts_dir,
                                     '%s_time_%s.sh' \
                                     %(job_name,
                                       time.strftime("%m-%d-%y_%H:%M:%S"))))
    make_bash_script(script_name, cmd)
    cluster_cmd = cluster_call + ' \"%s\"' % (script_name)
    job_id = launch_job(cluster_cmd, cmd_name)
    return job_id
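
And a sketch for this qsub-based, function-level variant (hedged: the settings file path is a placeholder, and the Settings object is assumed to be populated by load_settings as in the body):

# Hypothetical call; queue names come from the MISO settings file.
job_id = run_on_cluster("echo hello",
                        "test_job",
                        "/tmp/cluster_out",
                        queue_type="short",
                        settings_fname="~/miso_settings.txt")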
Example #9
    def run_on_cluster(self,
                       cmd,
                       job_name,
                       cluster_output_dir,
                       cluster_scripts_dir=None,
                       queue_type=None):
        '''
        Composes job script and launches job
        '''
        print "Submitting job: %s" % (job_name)
        queue_name = None

        # Load command name from settings file
        cmd_name = self.settings.get_cluster_command()

        if queue_type == "long":
            queue_name = self.settings.get_long_queue_name()
        elif queue_type == "short":
            queue_name = self.settings.get_short_queue_name()
        else:
            print "Warning: Unknown queue type: %s" % (queue_type)
            queue_name = queue_type

        if queue_type is None:
            print "  - queue type: unspecified"
        else:
            print "  - queue type: %s" % (queue_type)
        if queue_name is None:
            print "  - queue name: unspecified"
        else:
            print "  - queue name: %s" % (queue_name)

        misc_utils.make_dir(cluster_output_dir)
        if cluster_scripts_dir == None:
            cluster_scripts_dir = os.path.join(cluster_output_dir,
                                               'cluster_scripts')
            misc_utils.make_dir(cluster_scripts_dir)
        scripts_output_dir = os.path.join(cluster_output_dir, 'scripts_output')
        misc_utils.make_dir(scripts_output_dir)
        scripts_output_dir = os.path.abspath(scripts_output_dir)
        cluster_call = 'bsub -o \"%s\" -e \"%s\"' % (scripts_output_dir,
                                                     scripts_output_dir)
        # Add queue type if given one
        if queue_name != None:
            cluster_call += ' -q \"%s\"' % (queue_name)

        script_name = os.path.join(cluster_scripts_dir,
                                   '%s_time_%s.sh' \
                                   %(job_name,
                                     time.strftime("%m-%d-%y_%H_%M_%S")))
        self.make_bash_script(script_name, cmd)
        cluster_cmd = cluster_call + ' \"%s\"' % (script_name)
        job_id = self.launch_job(cluster_cmd)
        return job_id
Example #10
def run_on_cluster(
    cmd, job_name, cluster_output_dir, cluster_scripts_dir=None, queue_type=None, cmd_name="qsub", settings_fname=None
):
    print "Submitting job: %s" % (job_name)
    queue_name = None

    # Load command name from settings file
    if settings_fname != None:
        load_settings(settings_fname)
        cmd_name = Settings.get_cluster_command()

    if queue_type == "long":
        queue_name = Settings.get_long_queue_name()
    elif queue_type == "short":
        queue_name = Settings.get_short_queue_name()
    else:
        print "Warning: Unknown queue type: %s" % (queue_type)
        queue_name = queue_type

    if queue_type is None:
        print "  - queue type: unspecified"
    else:
        print "  - queue type: %s" % (queue_type)
    if queue_name is None:
        print "  - queue name: unspecified"
    else:
        print "  - queue name: %s" % (queue_name)

    misc_utils.make_dir(cluster_output_dir)
    if cluster_scripts_dir == None:
        cluster_scripts_dir = os.path.join(cluster_output_dir, "cluster_scripts")
        misc_utils.make_dir(cluster_scripts_dir)
    scripts_output_dir = os.path.join(cluster_output_dir, "scripts_output")
    misc_utils.make_dir(scripts_output_dir)
    scripts_output_dir = os.path.abspath(scripts_output_dir)
    cluster_call = '%s -o "%s" -e "%s"' % (cmd_name, scripts_output_dir, scripts_output_dir)
    # Add queue type if given one
    if queue_name != None:
        cluster_call += ' -q "%s"' % (queue_name)

    script_name = valid_cluster_name(
        os.path.join(cluster_scripts_dir, "%s_time_%s.sh" % (job_name, time.strftime("%m-%d-%y_%H:%M:%S")))
    )
    make_bash_script(script_name, cmd)
    cluster_cmd = cluster_call + ' "%s"' % (script_name)
    job_id = launch_job(cluster_cmd, cmd_name)
    return job_id
Example #11
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: Run in paired-end mode. Gives mean and standard deviation
      of fragment length distribution.
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # If given a template for the SAM file, use it
    template = None

    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    if "filter_reads" not in settings:
        filter_reads = True
    else:
        filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)

    for gene_id, gene_info in gff_genes.iteritems():
        lookup_id = gene_id
        # Skip genes that we were not asked to run on
        if lookup_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(map(lambda l: l < read_len, gene_obj.iso_lens)):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand,
                                      given_read_len=read_len)
        # Skip gene if none of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads,
                        min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type != None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            pass

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        miso_basename = miso_basename.replace(".pickle", "")
        output_filename = os.path.join(chrom_dir, "%s" %(miso_basename))
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
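
A hedged call sketch (all filenames and the gene ID are placeholders, and Settings.load(...) is assumed to have been called beforehand, since the function reads its sampler parameters from Settings):

# Hypothetical single-gene run in paired-end mode
# (fragment length mean 250, standard deviation 15).
compute_gene_psi(["ENSG00000112715"],
                 "indexed/chr6/ENSG00000112715.pickle",
                 "accepted_hits.sorted.bam",
                 "miso_output",
                 read_len=36,
                 overhang_len=4,
                 paired_end=["250", "15"])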
Example #12
    def __init__(self, gff_dir, bam_filename,
                 output_dir, read_len, overhang_len,
                 main_logger,
                 settings_fname=None,
                 paired_end=None,
                 use_cluster=False,
                 chunk_jobs=200,
                 SGEarray=False,
                 sge_job_name="misojob",
                 gene_ids=None,
                 num_proc=None,
                 wait_on_jobs=True):
        self.main_logger = main_logger
        self.threads = {}
        self.gff_dir = gff_dir
        self.bam_filename = bam_filename
        # Check that the BAM filename exists and that it has an index
        if not os.path.isfile(self.bam_filename):
            self.main_logger.error("BAM file %s not found." %(self.bam_filename))
            sys.exit(1)
        self.bam_index_fname = "%s.bai" %(self.bam_filename)
        if not os.path.isfile(self.bam_index_fname):
            self.main_logger.warning("Expected BAM index file %s not found." \
                                     %(self.bam_index_fname))
            self.main_logger.warning("Are you sure your BAM file is indexed?")
        self.output_dir = output_dir
        self.read_len = read_len
        # For now setting overhang to 1 always
        #self.overhang_len = overhang_len
        self.overhang_len = 1
        self.settings_fname = settings_fname
        self.paired_end = paired_end
        self.use_cluster = use_cluster
        self.chunk_jobs = chunk_jobs
        self.settings = Settings.get()
        self.cluster_cmd = Settings.get_cluster_command()
        self.sge_job_name = sge_job_name
        self.wait_on_jobs = wait_on_jobs
        # if chunk_jobs not given (i.e. set to False),
        # then set it to arbitrary value
        if not self.chunk_jobs:
            self.chunk_jobs = 200
        self.SGEarray = SGEarray
        self.num_processors = Settings.get_num_processors()
        if num_proc is not None:
            num_proc = int(num_proc)
            self.num_processors = num_proc
            self.main_logger.info("Using %d processors" %(num_proc))
        self.long_thresh = 50
        self.batch_logs_dir = \
            os.path.join(output_dir, "batch-logs")
        self.batch_genes_dir = \
            os.path.join(output_dir, "batch-genes")
        self.cluster_scripts_dir = \
            os.path.join(output_dir, "cluster_scripts")
        self.scripts_output_dir = \
            os.path.join(output_dir, "scripts_output")
        misc_utils.make_dir(self.batch_logs_dir)
        misc_utils.make_dir(self.batch_genes_dir)
        misc_utils.make_dir(self.cluster_scripts_dir)
        misc_utils.make_dir(self.scripts_output_dir)
        # First compile a set of genes that should be run on
        # and output them to file along with their indexed
        # filenames
        self.gene_ids_to_gff_index = \
            gff_utils.get_gene_ids_to_gff_index(gff_dir)
        # If we're given filtered gene IDs, use them
        if gene_ids is not None:
            self.gene_ids = gene_ids
        else:
            self.gene_ids = self.gene_ids_to_gff_index.keys()
        if len(self.gene_ids) == 0:
            self.main_logger.error("No genes to run on. Did you pass me the wrong path " \
                                   "to your index GFF directory? " \
                                   "Or perhaps your indexed GFF directory " \
                                   "is empty?")
            sys.exit(1)
        self.batch_filenames = self.output_batch_files()
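
A hedged construction sketch (directory names are placeholders; get_main_logger is the helper from Example #1, and Settings.load(...) is assumed to have been called already, since __init__ reads cluster and processor settings from Settings):

import logging

# Hypothetical setup of the dispatcher.
main_logger = get_main_logger("logs", level=logging.INFO)
dispatcher = GenesDispatcher("indexed_gff_dir",
                             "accepted_hits.sorted.bam",
                             "miso_output",
                             36,            # read_len
                             1,             # overhang_len
                             main_logger,
                             num_proc=4)
dispatcher.run()   # run() as invoked by compute_all_genes_psi (Example #15)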
Example #13
def main():
    from optparse import OptionParser
    parser = OptionParser()

    ##
    ## Main options
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi",
                      nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene. "
                      "Expects four arguments: the first is a gene ID or set "
                      "of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene "
                      "information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, "
                      "and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode.  Takes a mean and standard "
                      "deviation for the fragment length distribution (assumed "
                      "to have discretized normal form.)")
    parser.add_option("--compute-genes-from-file", dest="compute_genes_from_file",
                      nargs=3, default=None,
                      help="Runs on a set of genes from a file. Takes as input: "
                      "(1) a two-column tab-delimited file, where column 1 is the "
                      "event ID (ID field from GFF) and the second column is "
                      "the path to the indexed GFF file for that event. "
                      "MISO will run on all the events described in the file, "
                      "(2) a sorted, indexed BAM file to run on, and (3) a "
                      "directory to output results to.")

    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
                      help="Compute comparison statistics between the two "
                      "given samples. Expects three directories: the first is "
                      "sample1's MISO output, the second is sample2's MISO "
                      "output, and the third is the directory where "
                      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
                      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-cluster", action="store_true",
                      dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to "
                      "chunk events file into. Only applies when "
                      "running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int",
                      default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int",
                      default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
                      help="Event type of two-isoform "
                      "events (e.g. 'SE', 'RI', 'A3SS', ...)")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    if options.compute_gene_psi is None:
        greeting()

    ##
    ## Load the settings file
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."

    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
    ##
    ## Main interface based on SAM files
    ##
    if options.compute_genes_from_file != None:
        # Run on events given by file
        run_compute_genes_from_file(options)
    if options.compute_gene_psi != None:
        run_compute_gene_psi(options)

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label != None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        assert(len(samples_label) >= 1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        summarize_sampler_results(samples_dir, summary_filename,
                                  use_compressed=use_compressed)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Example #14
def main():
    from optparse import OptionParser
    parser = OptionParser()

    ##
    ## Main options
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi",
                      nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene. "
                      "Expects four arguments: the first is a gene ID or set "
                      "of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene "
                      "information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, "
                      "and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode.  Takes a mean and standard "
                      "deviation for the fragment length distribution (assumed "
                      "to have discretized normal form.)")
    parser.add_option("--compute-genes-from-file", dest="compute_genes_from_file",
                      nargs=3, default=None,
                      help="Runs on a set of genes from a file. Takes as input: "
                      "(1) a two-column tab-delimited file, where column 1 is the "
                      "event ID (ID field from GFF) and the second column is "
                      "the path to the indexed GFF file for that event. "
                      "MISO will run on all the events described in the file, "
                      "(2) a sorted, indexed BAM file to run on, and (3) a "
                      "directory to output results to.")
    
    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
		      help="Compute comparison statistics between the two "
                      "given samples. Expects three directories: the first is "
                      "sample1's MISO output, the second is sample2's MISO "
                      "output, and the third is the directory where "
		      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
		      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-cluster", action="store_true",
                      dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
		      help="Size (in number of events) of each job to "
                      "chunk events file into. Only applies when "
                      "running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int",
                      default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int",
                      default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
		      help="Event type of two-isoform "
                      "events (e.g. 'SE', 'RI', 'A3SS', ...)")    
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    if options.compute_gene_psi is None:
        greeting()

    ##
    ## Load the settings file 
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."
            
    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
    ##
    ## Main interface based on SAM files
    ##
    if options.compute_genes_from_file != None:
        # Run on events given by file
        run_compute_genes_from_file(options)
    if options.compute_gene_psi != None:
        run_compute_gene_psi(options)
        
    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label != None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        assert(len(samples_label) >= 1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        summarize_sampler_results(samples_dir, summary_filename,
                                  use_compressed=use_compressed)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="    
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Example #15
def compute_all_genes_psi(gff_dir, bam_filename, read_len,
                          output_dir, main_logger,
                          use_cluster=False,
                          SGEarray=False,
                          chunk_jobs=800,
                          overhang_len=1,
                          paired_end=None,
                          settings_fname=None,
                          job_name="misojob",
                          num_proc=None,
                          prefilter=False,
                          wait_on_jobs=True):
    """
    Compute Psi values for genes using a GFF and a BAM filename.

    SGE functionality contributed by Michael Lovci.

    Options:
    - prefilter: if set to True, prefilter events by coverage.
      Uses bedtools to determine coverage of each event and remove
      events that do not meet the coverage criteria from the run.
    """
    print "Computing Psi values..." 
    print "  - GFF index: %s" %(gff_dir)
    print "  - BAM: %s" %(bam_filename)
    print "  - Read length: %d" %(read_len)
    print "  - Output directory: %s" %(output_dir)

    misc_utils.make_dir(output_dir)

    # Check GFF and BAM for various errors like headers mismatch
    run_events.check_gff_and_bam(gff_dir, bam_filename, main_logger,
                                 given_read_len=read_len)
    
    # Prefilter events that do not meet the coverage criteria
    # If filtering is on, only run on events that meet
    # the filter.
    all_gene_ids = None
    
    if prefilter:
        main_logger.info("Prefiltering on")
        if misc_utils.which("bedtools") is None:
            main_logger.error("Error: Cannot use bedtools. Bedtools is " \
                              "required for --prefilter option")
            sys.exit(1)
        filtered_gene_ids = run_events.get_ids_passing_filter(gff_dir,
                                                              bam_filename,
                                                              output_dir)
        # Prefiltering succeeded, so process only gene ids that
        # pass the filter
        if filtered_gene_ids != None:
            num_pass = len(filtered_gene_ids)
            all_gene_ids = filtered_gene_ids
            # If none of the events meet the read coverage filter
            # something must have gone wrong, e.g. mismatch
            # in chromosome headers between BAM and GFF
            if num_pass == 0:
                main_logger.error("None of the events in %s appear to meet the " \
                                  "read coverage filter. Check that your BAM headers " \
                                  "in %s match the GFF headers of indexed events." \
                                  %(gff_dir,
                                    bam_filename))
                sys.exit(1)
            main_logger.info("Total of %d events pass coverage filter." \
                             %(num_pass))

    ##
    ## Submit jobs either using cluster or locally
    ## using multi-cores.
    ##
    dispatcher = GenesDispatcher(gff_dir,
                                 bam_filename,
                                 output_dir,
                                 read_len,
                                 overhang_len,
                                 main_logger,
                                 settings_fname=settings_fname,
                                 paired_end=paired_end,
                                 use_cluster=use_cluster,
                                 chunk_jobs=chunk_jobs,
                                 sge_job_name=job_name,
                                 SGEarray=SGEarray,
                                 gene_ids=all_gene_ids,
                                 num_proc=num_proc,
                                 wait_on_jobs=wait_on_jobs)
    dispatcher.run()
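
A hedged top-level call sketch (paths are placeholders; get_main_logger is the helper from Example #1):

# Hypothetical local (non-cluster) run across all indexed genes.
main_logger = get_main_logger("logs")
compute_all_genes_psi("indexed_gff_dir",
                      "accepted_hits.sorted.bam",
                      36,               # read_len
                      "miso_output",
                      main_logger,
                      num_proc=4)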
Example #16
def run_SGEarray_cluster(arg_list,
                         argfile,
                         cluster_output_dir,
                         queue_type="long",
                         cluster_scripts_dir=None,
                         chunk=2500,
                         settings=None,
                         cmd_name="qsub",
                         job_name="miso_job"):
    """
    Run MISO jobs on cluster using SGE.

    Function contributed by Michael Lovci, UCSD.
    """
    misc_utils.make_dir(cluster_output_dir)
    # Create arguments file to pass on to job
    f = open(argfile, 'w')
    nargs = len(arg_list)
    if nargs % chunk == 0:
        njobs = nargs / chunk
    else:
        njobs = 1 + (nargs / chunk)

    for args in arg_list:
        f.write(args[0] + "\n")
    f.close()

    if cluster_scripts_dir == None:
        cluster_scripts_dir = os.path.join(cluster_output_dir,
                                           'cluster_scripts')
    misc_utils.make_dir(cluster_scripts_dir)
    scripts_output_dir = os.path.join(cluster_output_dir, 'scripts_output')
    misc_utils.make_dir(scripts_output_dir)
    scripts_output_dir = os.path.abspath(scripts_output_dir)
    script_error = os.path.join(scripts_output_dir,
                                string.join([job_name, "err"], "."))
    script_out = os.path.join(scripts_output_dir,
                              string.join([job_name, "out"], "."))
    cluster_script = os.path.join(cluster_scripts_dir, "run_miso.sh")

    if settings != None:
        load_settings(settings)
        cmd_name = Settings.get_cluster_command()

    if queue_type == "long":
        queue_name = Settings.get_long_queue_name()
    elif queue_type == "short":
        queue_name = Settings.get_short_queue_name()
    else:
        raise Exception, "Unknown queue type: %s" % (queue_type)

    if queue_type == None:
        print "  - queue: unspecified"
    else:
        print "  - queue: %s, using queue name %s" % (queue_type, queue_name)
    cs = open(cluster_script, 'w')
    cs.write("#!/bin/sh" + "\n")
    cs.write("#$ -N %s\n" % (job_name))
    cs.write("#$ -S /bin/sh\n")
    cs.write("#$ -p -1023\n")
    cs.write("#$ -o %s\n" % (script_out))
    cs.write("#$ -e %s\n" % (script_error))
    cs.write("#$ -t 1-%s\n" % (njobs))

    ##execute from current working directory
    cs.write("#$ -cwd\n")

    ## import environment variables
    cs.write("#$ -V\n")
    if queue_name:
        cs.write("#$ -l %s\n" % (queue_name))
    cs.write("echo \"hostname is:\"\n")
    cs.write("hostname\n")
    cs.write("ARGFILE=%s\n" % argfile)
    cs.write("SEQ=/usr/bin/seq\n")
    cs.write("index=0\n")
    cs.write("lastindex=0\n")
    cs.write("let \"index = $SGE_TASK_ID * %s\"\n" % (chunk))
    chunk2 = chunk - 1
    cs.write("let \"lastindex = $index - %s\"\n" % (chunk2))
    if chunk2 > 0:
        cs.write("for i in `$SEQ $lastindex $index`\n")
    else:
        cs.write("for i in $index\n")  # if user chooses 1 for chunk size
    cs.write("do\n")
    cs.write("  line=$(cat $ARGFILE | head -n $i | tail -n 1)\n")
    cs.write("  eval $line\n")
    cs.write("done\n")
    cs.close()

    # Make script executable
    os.system('chmod +x \"%s\"' % (cluster_script))
    qsub_cmd = cmd_name + ' \"%s\"' % (cluster_script)

    os.system(qsub_cmd)
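
A hedged call sketch (commands and paths are placeholders). Note that each entry of arg_list is itself a sequence and only its first element is written to the argfile, so one-element lists are the natural shape:

# Hypothetical SGE array submission: one MISO command per array task.
arg_list = [["python run_miso.py --compute-gene-psi geneA index/geneA.pickle reads.bam out"],
            ["python run_miso.py --compute-gene-psi geneB index/geneB.pickle reads.bam out"]]
run_SGEarray_cluster(arg_list,
                     "miso_args.txt",
                     "cluster_out",
                     queue_type="long",
                     chunk=1,
                     job_name="miso_job")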
Example #17
def compute_psi(sample_filenames,
                output_dir,
                event_type,
                read_len,
                overhang_len,
                use_cluster=False,
                chunk_jobs=False,
                filter_events=True,
                events_info_filename=None,
                settings_filename=None):
    """
    Compute Psi values for skipped exons.  Sample filenames is a mapping from
    sample label to sample.

      - sample_filenames = [[sample_label1, sample_filename1],
                            [sample_label2, sample_filename2]]
      - output_dir: output directory
      - event_type: 'SE', 'RI', etc.
    """
    misc_utils.make_dir(output_dir)

    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)

    misc_utils.make_dir(output_dir)

    print "Computing Psi for events of type %s" % (event_type)
    print "  - samples used: ", sample_filenames.keys()

    for sample_label, sample_filename in sample_filenames.iteritems():
        print "Processing sample: label=%s, filename=%s" \
              %(sample_label, sample_filename)
        results_output_dir = os.path.join(output_dir, sample_label)
        misc_utils.make_dir(results_output_dir)

        # Load the set of counts and serialize them into JSON
        events = \
            as_events.load_event_counts(sample_filename,
                                        event_type,
                                        events_info_filename=events_info_filename)

        # Filter events
        if filter_events:
            print "Filtering events..."
            events.filter_events(settings=Settings.get())

        print "Running on a total of %d events." % (len(events.events))

        events_filename = events.output_file(results_output_dir, sample_label)

        # Run MISO on them
        miso_cmd = "python %s --compute-two-iso-psi %s %s --event-type %s " \
                          "--read-len %d --overhang-len %d " \
                          %(os.path.join(miso_path, 'run_miso.py'),
                            events_filename,
                            results_output_dir,
                            event_type,
                            read_len,
                            overhang_len)
        if use_cluster:
            if chunk_jobs:
                miso_cmd += ' --use-cluster --chunk-jobs %d' % (chunk_jobs)
            else:
                miso_cmd += ' --use-cluster'
        print "Executing: %s" % (miso_cmd)
        if use_cluster:
            print " - Using cluster"
        os.system(miso_cmd)
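
A hedged call sketch (labels and count filenames are placeholders; the function iterates the mapping with .iteritems(), so a dict is expected):

# Hypothetical two-sample run on skipped-exon (SE) events.
sample_filenames = {"heart": "heart_counts.txt",
                    "brain": "brain_counts.txt"}
compute_psi(sample_filenames,
            "psi_output",
            "SE",
            read_len=36,
            overhang_len=4)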
Example #18
def output_samples_comparison(sample1_dir, sample2_dir, output_dir,
                              alpha=.95,
                              sample_labels=None,
                              use_compressed=None):
    """
    Compute the bayes factors, posterior means, and other statistics
    between the two samples and output them to a directory.

    Expects two directories with samples from a MISO run, where corresponding
    events in the two samples' directories begin with the same event name.
    """
    print "Given output dir: %s" %(output_dir)
    print "Retrieving MISO files in sample directories..."
    sample1_obj = MISOSamples(sample1_dir,
                              use_compressed=use_compressed)
    sample2_obj = MISOSamples(sample2_dir,
                              use_compressed=use_compressed)
    print "Computing sample comparison between %s and %s..." %(sample1_dir,
                                                               sample2_dir)
    print "  - No. of events in %s: %d" %(sample1_dir, sample1_obj.num_events)
    print "  - No. of events in %s: %d" %(sample2_dir, sample2_obj.num_events)
    # Output header for Bayes factor file
    if sample_labels is None:
        # Use directory names as sample labels
        sample1_label = os.path.basename(os.path.normpath(sample1_dir))
        sample2_label = os.path.basename(os.path.normpath(sample2_dir))
    else:
        # If we're given sample labels, use them
        sample1_label, sample2_label = sample_labels
        print "Using user-given sample labels (sample1 = %s, sample2 = %s)" \
              %(sample1_label, sample2_label)
    output_dir = os.path.join(output_dir, "%s_vs_%s" %(sample1_label,
                                                       sample2_label))
    print "Creating comparisons parent directory: %s" %(output_dir)
    # Create parent directory for comparison
    misc_utils.make_dir(output_dir)
	
    # Create directory for Bayes factors
    bf_output_dir = os.path.join(output_dir, 'bayes-factors/')
    misc_utils.make_dir(bf_output_dir)
    
    header_fields = ['event_name',
                     'sample1_posterior_mean',
                     'sample1_ci_low',
                     'sample1_ci_high',
                     'sample2_posterior_mean',
                     'sample2_ci_low',
                     'sample2_ci_high',
                     'diff',
                     'bayes_factor',
                     'isoforms',
                     'sample1_counts',
                     'sample1_assigned_counts',
                     'sample2_counts',
                     'sample2_assigned_counts',
                     'chrom',
                     'strand',
                     'mRNA_starts',
                     'mRNA_ends']
    header_line = "\t".join(header_fields) + "\n"
    output_filename = \
        os.path.join(bf_output_dir, "%s_vs_%s.miso_bf" %(sample1_label,
                                                         sample2_label))
    output_file = open(output_filename, 'w')
    output_file.write(header_line)

    num_events_compared = 0
    file_num = 0

    # Compute the Bayes factors for each file
    for event_name in sample1_obj.all_event_names:
        sample1_results = sample1_obj.get_event_samples(event_name)
        # Parameters from raw MISO samples file
        samples1 = sample1_results[0]
        header1 = sample1_results[1]
        header1 = header1[0]
        params1 = parse_sampler_params_from_header(header1)
        # Extract gene information if available
        gene_info = get_gene_info_from_params(params1)
        # Find corresponding event filename in sample 2
        sample2_results = sample2_obj.get_event_samples(event_name)
        if sample2_results is None:
            continue
        num_events_compared += 1
        # Compute delta of posterior samples and Bayes factors
        diff_range = arange(-1, 1, 0.001)
        delta_densities = \
          compute_delta_densities(sample1_results,
                                  sample2_results,
                                  diff_range,
                                  event_name=event_name,
                                  sample1_label=sample1_label,
                                  sample2_label=sample2_label)
        bf = delta_densities['bayes_factor']
        num_isoforms = shape(delta_densities['samples1'])[1]
        sample1_posterior_mean = mean(delta_densities['samples1'], 0)
        sample2_posterior_mean = mean(delta_densities['samples2'], 0)
        # Get the labels of the isoforms
        isoforms_field = delta_densities['isoforms']
        # Get the counts information about both samples
        sample1_counts_info = delta_densities['sample1_counts']
        sample2_counts_info = delta_densities['sample2_counts']

        # Compute posterior mean and credible intervals for sample 1
        sample1_cred_intervals = \
          format_credible_intervals(event_name,
                                    delta_densities['samples1'],
                                    confidence_level=alpha)
        sample1_ci_low = sample1_cred_intervals[2]
        sample1_ci_high = sample1_cred_intervals[3]
        # Compute posterior mean and credible intervals for sample 2
        sample2_cred_intervals = \
          format_credible_intervals(event_name,
                                    delta_densities['samples2'],
                                    confidence_level=alpha)
        sample2_ci_low = sample2_cred_intervals[2]
        sample2_ci_high = sample2_cred_intervals[3]
        posterior_diff = sample1_posterior_mean - sample2_posterior_mean
        # Use precision of two decimal places
        if num_isoforms == 2:
            sample1_posterior_mean = \
                Decimal(str(sample1_posterior_mean[0])).quantize(Decimal('0.01'))
            sample2_posterior_mean = \
                Decimal(str(sample2_posterior_mean[0])).quantize(Decimal('0.01'))
            posterior_diff = "%.2f" %(sample1_posterior_mean - sample2_posterior_mean)
            bayes_factor = "%.2f" %(bf[0])
        else:
            posterior_diff = \
                ",".join(["%.2f" %(v) for v in (sample1_posterior_mean - sample2_posterior_mean)])
            sample1_posterior_mean = sample1_cred_intervals[1]
            sample2_posterior_mean = sample2_cred_intervals[1]
            bayes_factor = ",".join(["%.2f" %(max(v, 0)) for v in bf])

        # Write comparison output line
        output_fields = [event_name,
                         # Mean and confidence bounds for sample 1
                         "%s" %(sample1_posterior_mean),
                         "%s" %(sample1_ci_low),
                         "%s" %(sample1_ci_high),
                         # Mean and confidence bounds for sample 2
                         "%s" %(sample2_posterior_mean),
                         "%s" %(sample2_ci_low),
                         "%s" %(sample2_ci_high),
                         # Delta Psi value
                         "%s" %(posterior_diff),
                         # Bayes factor
                         "%s" %(bayes_factor),
                         # Description of the isoforms
                         "%s" %(isoforms_field),
                         # Counts information for sample 1
                         "%s" %(sample1_counts_info['counts']),
                         "%s" %(sample1_counts_info['assigned_counts']),
                         # Counts information for sample 2
                         "%s" %(sample2_counts_info['counts']),
                         "%s" %(sample2_counts_info['assigned_counts']),
                         # Gene information
                         gene_info["chrom"],
                         gene_info["strand"],
                         gene_info["mRNA_starts"],
                         gene_info["mRNA_ends"]]
        output_line = "%s\n" %("\t".join(output_fields))
        output_file.write(output_line)
    print "Compared a total of %d events." %(num_events_compared)
    output_file.close()
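A hedged usage sketch for output_samples_comparison; the directories are placeholders and must each contain a completed MISO run. With these labels, the Bayes factor file is written to comparisons/control_vs_knockdown/bayes-factors/control_vs_knockdown.miso_bf:

# Illustrative only: directory names are made up.
output_samples_comparison("miso-output/control/",
                          "miso-output/knockdown/",
                          "comparisons/",
                          sample_labels=["control", "knockdown"])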
Example #19
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: Run in paired-end mode. Gives mean and standard deviation
      of fragment length distribution.
    """
    misc_utils.make_dir(output_dir)
        
    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return
    
    num_genes = len(gene_ids)
    
    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
    
    # If given a template for the SAM file, use it
    template = None

    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    if "filter_reads" not in settings:
        filter_reads = True
    else:
        filter_reads = settings["filter_reads"]
        
    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)
    
    for gene_id, gene_info in gff_genes.iteritems():
        lookup_id = gene_id
        # Skip genes that we were not asked to run on
        if lookup_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(map(lambda l: l < read_len, gene_obj.iso_lens)):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue
        
        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand)
        # Skip gene if none of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads,
                        min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            pass

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        miso_basename = miso_basename.replace(".pickle", "")
        output_filename = os.path.join(chrom_dir, miso_basename)
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
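A hedged usage sketch for compute_gene_psi; the gene ID and paths are placeholders. Note that the index filename must end in .pickle and the BAM must be sorted and indexed:

# Illustrative only: gene ID and paths are made up.
compute_gene_psi(["ENSMUSG00000025902"],
                 "indexed/chr1/ENSMUSG00000025902.pickle",
                 "sample.sorted.bam",
                 "gene-psi-output/",
                 read_len=36,
                 overhang_len=1,
                 paired_end=[250, 15])  # mean and sd of fragment lengths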
Example #20
def __init__(self, gff_dir, bam_filename,
             output_dir, read_len, overhang_len,
             settings_fname=None,
             paired_end=None,
             use_cluster=False,
             chunk_jobs=200,
             SGEarray=False,
             sge_job_name="misojob",
             gene_ids=None,
             num_proc=None,
             wait_on_jobs=True):
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename
    # Check that the BAM filename exists and that it has an index
    if not os.path.isfile(self.bam_filename):
        print "Error: BAM file %s not found." %(self.bam_filename)
        sys.exit(1)
    self.bam_index_fname = "%s.bai" %(self.bam_filename)
    if not os.path.isfile(self.bam_index_fname):
        print "WARNING: Expected BAM index file %s not found." \
            %(self.bam_index_fname)
        print "Are you sure your BAM file is indexed?"
    self.output_dir = output_dir
    self.read_len = read_len
    # For now setting overhang to 1 always
    #self.overhang_len = overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    self.chunk_jobs = chunk_jobs
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    # If chunk_jobs was not given (i.e. set to False),
    # fall back to a default chunk size
    if not self.chunk_jobs:
        self.chunk_jobs = 200
    self.SGEarray = SGEarray
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        num_proc = int(num_proc)
        self.num_processors = num_proc
        print "Using %d processors" %(num_proc)
    self.long_thresh = 50
    self.batch_logs_dir = \
        os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = \
        os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = \
        os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = \
        os.path.join(output_dir, "scripts_output")
    misc_utils.make_dir(self.batch_logs_dir)
    misc_utils.make_dir(self.batch_genes_dir)
    misc_utils.make_dir(self.cluster_scripts_dir)
    misc_utils.make_dir(self.scripts_output_dir)
    # First compile a set of genes that should be run on
    # and output them to file along with their indexed
    # filenames
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    # If we're given filtered gene IDs, use them
    if gene_ids is not None:
        self.gene_ids = gene_ids
    else:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    if len(self.gene_ids) == 0:
        print "Error: No genes to run on. Did you pass me the wrong path " \
              "to your index GFF directory? " \
              "Or perhaps your indexed GFF directory " \
              "is empty?"
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
Example #21
def compute_all_genes_psi(gff_dir, bam_filename, read_len, output_dir,
                          use_cluster=False,
                          SGEarray=False,
                          chunk_jobs=800,
                          overhang_len=1,
                          paired_end=None,
                          settings_fname=None,
                          job_name="misojob",
                          num_proc=None,
                          prefilter=False,
                          wait_on_jobs=True):
    """
    Compute Psi values for genes using a GFF and a BAM filename.

    SGE functionality contributed by Michael Lovci.

    Options:
    - prefilter: if set to True, prefilter events by coverage.
      Uses bedtools to determine coverage of each event and remove
      events that do not meet the coverage criteria from the run.
    """
    print "Computing Psi values..." 
    print "  - GFF index: %s" %(gff_dir)
    print "  - BAM: %s" %(bam_filename)
    print "  - Read length: %d" %(read_len)
    print "  - Output directory: %s" %(output_dir)

    misc_utils.make_dir(output_dir)

    # Check GFF and BAM for various errors like headers mismatch
    run_events.check_gff_and_bam(gff_dir, bam_filename,
                                 given_read_len=read_len)
    
    # Prefilter events that do not meet the coverage criteria
    # If filtering is on, only run on events that meet
    # the filter.
    all_gene_ids = None
    
    if prefilter:
        print "  - Prefiltering on"
        if misc_utils.which("bedtools") is None:
            print "Error: Cannot use bedtools. Bedtools is " \
                  "required for --prefilter option"
            sys.exit(1)
        filtered_gene_ids = run_events.get_ids_passing_filter(gff_dir,
                                                              bam_filename,
                                                              output_dir)
        # Prefiltering succeeded, so process only gene ids that
        # pass the filter
        if filtered_gene_ids is not None:
            num_pass = len(filtered_gene_ids)
            all_gene_ids = filtered_gene_ids
            # If none of the events meet the read coverage filter
            # something must have gone wrong, e.g. mismatch
            # in chromosome headers between BAM and GFF
            if num_pass == 0:
                print "Error: None of the events in %s appear to meet the " \
                      "read coverage filter. Check that your BAM headers " \
                      "in %s match the GFF headers of indexed events." \
                      %(gff_dir,
                        bam_filename)
                sys.exit(1)
            print "  - Total of %d events pass coverage filter." \
                %(num_pass)

    ##
    ## Submit jobs either using cluster or locally
    ## using multi-cores.
    ##
    dispatcher = GenesDispatcher(gff_dir,
                                 bam_filename,
                                 output_dir,
                                 read_len,
                                 overhang_len,
                                 settings_fname=settings_fname,
                                 paired_end=paired_end,
                                 use_cluster=use_cluster,
                                 chunk_jobs=chunk_jobs,
                                 sge_job_name=job_name,
                                 SGEarray=SGEarray,
                                 gene_ids=all_gene_ids,
                                 num_proc=num_proc,
                                 wait_on_jobs=wait_on_jobs)
    dispatcher.run()
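A hedged usage sketch for compute_all_genes_psi, which drives the GenesDispatcher constructor shown in the previous example; all paths are placeholders:

# Illustrative only: paths are made up.
compute_all_genes_psi("indexed_gff/",
                      "sample.sorted.bam",
                      36,               # read length
                      "all-genes-psi/",
                      num_proc=4)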
Example #22
def run_SGEarray_cluster(
    arg_list,
    argfile,
    cluster_output_dir,
    queue_type="long",
    cluster_scripts_dir=None,
    chunk=2500,
    settings=None,
    cmd_name="qsub",
    job_name="miso_job",
):
    """
    Run MISO jobs on cluster using SGE.

    Function contributed by Michael Lovci, UCSD.
    """
    misc_utils.make_dir(cluster_output_dir)
    # Create arguments file to pass on to job
    f = open(argfile, "w")
    nargs = len(arg_list)
    if nargs % chunk == 0:
        njobs = nargs / chunk
    else:
        njobs = 1 + (nargs / chunk)

    for args in arg_list:
        f.write(args[0] + "\n")
    f.close()

    if cluster_scripts_dir is None:
        cluster_scripts_dir = os.path.join(cluster_output_dir, "cluster_scripts")
    misc_utils.make_dir(cluster_scripts_dir)
    scripts_output_dir = os.path.join(cluster_output_dir, "scripts_output")
    misc_utils.make_dir(scripts_output_dir)
    scripts_output_dir = os.path.abspath(scripts_output_dir)
    script_error = os.path.join(scripts_output_dir, ".".join([job_name, "err"]))
    script_out = os.path.join(scripts_output_dir, ".".join([job_name, "out"]))
    cluster_script = os.path.join(cluster_scripts_dir, "run_miso.sh")

    if settings is not None:
        load_settings(settings)
        cmd_name = Settings.get_cluster_command()

    if queue_type == "long":
        queue_name = Settings.get_long_queue_name()
    elif queue_type == "short":
        queue_name = Settings.get_short_queue_name()
    else:
        raise Exception, "Unknown queue type: %s" % (queue_type)

    if queue_name is None:
        print "  - queue: unspecified"
    else:
        print "  - queue: %s, using queue name %s" % (queue_type, queue_name)
    cs = open(cluster_script, "w")
    cs.write("#!/bin/sh" + "\n")
    cs.write("#$ -N %s\n" % (job_name))
    cs.write("#$ -S /bin/sh\n")
    cs.write("#$ -p -1023\n")
    cs.write("#$ -o %s\n" % (script_out))
    cs.write("#$ -e %s\n" % (script_error))
    cs.write("#$ -t 1-%s\n" % (njobs))

    ##execute from current working directory
    cs.write("#$ -cwd\n")

    ## import environment variables
    cs.write("#$ -V\n")
    if queue_name:
        cs.write("#$ -l %s\n" % (queue_name))
    cs.write('echo "hostname is:"\n')
    cs.write("hostname\n")
    cs.write("ARGFILE=%s\n" % argfile)
    cs.write("SEQ=/usr/bin/seq\n")
    cs.write("index=0\n")
    cs.write("lastindex=0\n")
    cs.write('let "index = $SGE_TASK_ID * %s"\n' % (chunk))
    chunk2 = chunk - 1
    cs.write('let "lastindex = $index - %s"\n' % (chunk2))
    if chunk2 > 0:
        cs.write("for i in `$SEQ $lastindex $index`\n")
    else:
        cs.write("for i in $index\n")  # if user chooses 1 for chunk size
    cs.write("do\n")
    cs.write("  line=$(cat $ARGFILE | head -n $i | tail -n 1)\n")
    cs.write("  eval $line\n")
    cs.write("done\n")
    cs.close()

    # Make script executable
    os.system('chmod +x "%s"' % (cluster_script))
    qsub_cmd = cmd_name + ' "%s"' % (cluster_script)

    os.system(qsub_cmd)
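A hedged usage sketch for run_SGEarray_cluster. As the argument-file loop above shows, each entry of arg_list is a sequence whose first element is a complete shell command; chunk=1 exercises the single-index branch of the generated script. The command strings and the settings filename are placeholders:

# Illustrative only: commands and paths are made up.
arg_list = [["echo processing gene A"],
            ["echo processing gene B"]]
run_SGEarray_cluster(arg_list, "miso_args.txt", "sge-output/",
                     queue_type="short", chunk=1,
                     settings="miso_settings.txt")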
Example #23
def compute_psi(sample_filenames, output_dir, event_type,
                read_len, overhang_len,
                use_cluster=False,
                chunk_jobs=False,
                filter_events=True,
                events_info_filename=None,
                settings_filename=None):
    """
    Compute Psi values for skipped exons. sample_filenames is a dictionary
    mapping each sample label to its sample filename:

      - sample_filenames = {sample_label1: sample_filename1,
                            sample_label2: sample_filename2}
      - output_dir: output directory
      - event_type: 'SE', 'RI', etc.
    """
    misc_utils.make_dir(output_dir)

    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)

    misc_utils.make_dir(output_dir)

    print "Computing Psi for events of type %s" %(event_type)
    print "  - samples used: ", sample_filenames.keys()

    for sample_label, sample_filename in sample_filenames.iteritems():
        print "Processing sample: label=%s, filename=%s" \
            %(sample_label, sample_filename)
        results_output_dir = os.path.join(output_dir, sample_label)
        misc_utils.make_dir(results_output_dir)

        # Load the set of counts and serialize them into JSON
        events = \
            as_events.load_event_counts(sample_filename,
                                        event_type,
                                        events_info_filename=events_info_filename)

        # Filter events
        if filter_events:
            print "Filtering events..."
            events.filter_events(settings=Settings.get())

        print "Running on a total of %d events." %(len(events.events))

        events_filename = events.output_file(results_output_dir,
                                             sample_label)

        # Run MISO on them
        miso_cmd = "python %s --compute-two-iso-psi %s %s --event-type %s " \
                   "--read-len %d --overhang-len %d " \
                   %(os.path.join(miso_path, 'run_miso.py'),
                     events_filename,
                     results_output_dir,
                     event_type,
                     read_len,
                     overhang_len)
        if use_cluster:
            if chunk_jobs:
                miso_cmd += ' --use-cluster --chunk-jobs %d' %(chunk_jobs)
            else:
                miso_cmd += ' --use-cluster'
        print "Executing: %s" %(miso_cmd)
        if use_cluster:
            print " - Using cluster"
        os.system(miso_cmd)
Example #24
def output_samples_comparison(sample1_dir,
                              sample2_dir,
                              output_dir,
                              alpha=.95,
                              sample_labels=None,
                              use_compressed=None):
    """
    Compute the Bayes factors, posterior means, and other statistics
    between the two samples and output them to a directory.

    Expects two directories with samples from a MISO run, where corresponding
    events in the two samples' directories begin with the same event name.
    """
    print "Given output dir: %s" % (output_dir)
    print "Retrieving MISO files in sample directories..."
    sample1_obj = MISOSamples(sample1_dir, use_compressed=use_compressed)
    sample2_obj = MISOSamples(sample2_dir, use_compressed=use_compressed)
    print "Computing sample comparison between %s and %s..." % (sample1_dir,
                                                                sample2_dir)
    print "  - No. of events in %s: %d" % (sample1_dir, sample1_obj.num_events)
    print "  - No. of events in %s: %d" % (sample2_dir, sample2_obj.num_events)
    # Output header for Bayes factor file
    if sample_labels is None:
        # Use directory names as sample labels
        sample1_label = os.path.basename(os.path.normpath(sample1_dir))
        sample2_label = os.path.basename(os.path.normpath(sample2_dir))
    else:
        # If we're given sample labels, use them
        sample1_label, sample2_label = sample_labels
        print "Using user-given sample labels (sample1 = %s, sample2 = %s)" \
              %(sample1_label, sample2_label)
    output_dir = os.path.join(output_dir,
                              "%s_vs_%s" % (sample1_label, sample2_label))
    print "Creating comparisons parent directory: %s" % (output_dir)
    # Create parent directory for comparison
    misc_utils.make_dir(output_dir)

    # Create directory for Bayes factors
    bf_output_dir = os.path.join(output_dir, 'bayes-factors/')
    misc_utils.make_dir(bf_output_dir)

    header_fields = [
        'event_name', 'sample1_posterior_mean', 'sample1_ci_low',
        'sample1_ci_high', 'sample2_posterior_mean', 'sample2_ci_low',
        'sample2_ci_high', 'diff', 'bayes_factor', 'isoforms',
        'sample1_counts', 'sample1_assigned_counts', 'sample2_counts',
        'sample2_assigned_counts', 'chrom', 'strand', 'mRNA_starts',
        'mRNA_ends'
    ]
    header_line = "\t".join(header_fields) + "\n"
    output_filename = \
        os.path.join(bf_output_dir, "%s_vs_%s.miso_bf" %(sample1_label,
                                                         sample2_label))
    output_file = open(output_filename, 'w')
    output_file.write(header_line)

    num_events_compared = 0
    file_num = 0

    # Compute the Bayes factors for each file
    for event_name in sample1_obj.all_event_names:
        sample1_results = sample1_obj.get_event_samples(event_name)
        # Parameters from raw MISO samples file
        samples1 = sample1_results[0]
        header1 = sample1_results[1]
        header1 = header1[0]
        params1 = parse_sampler_params_from_header(header1)
        # Extract gene information if available
        gene_info = get_gene_info_from_params(params1)
        # Find corresponding event filename in sample 2
        sample2_results = sample2_obj.get_event_samples(event_name)
        if sample2_results is None:
            continue
        num_events_compared += 1
        # Compute delta of posterior samples and Bayes factors
        diff_range = arange(-1, 1, 0.001)
        delta_densities = \
          compute_delta_densities(sample1_results,
                                  sample2_results,
                                  diff_range,
                                  event_name=event_name,
                                  sample1_label=sample1_label,
                                  sample2_label=sample2_label)
        bf = delta_densities['bayes_factor']
        num_isoforms = shape(delta_densities['samples1'])[1]
        sample1_posterior_mean = mean(delta_densities['samples1'], 0)
        sample2_posterior_mean = mean(delta_densities['samples2'], 0)
        # Get the labels of the isoforms
        isoforms_field = delta_densities['isoforms']
        # Get the counts information about both samples
        sample1_counts_info = delta_densities['sample1_counts']
        sample2_counts_info = delta_densities['sample2_counts']

        # Compute posterior mean and credible intervals for sample 1
        sample1_cred_intervals = \
          format_credible_intervals(event_name,
                                    delta_densities['samples1'],
                                    confidence_level=alpha)
        sample1_ci_low = sample1_cred_intervals[2]
        sample1_ci_high = sample1_cred_intervals[3]
        # Compute posterior mean and credible intervals for sample 2
        sample2_cred_intervals = \
          format_credible_intervals(event_name,
                                    delta_densities['samples2'],
                                    confidence_level=alpha)
        sample2_ci_low = sample2_cred_intervals[2]
        sample2_ci_high = sample2_cred_intervals[3]
        posterior_diff = sample1_posterior_mean - sample2_posterior_mean
        # Use precision of two decimal places
        if num_isoforms == 2:
            sample1_posterior_mean = \
                Decimal(str(sample1_posterior_mean[0])).quantize(Decimal('0.01'))
            sample2_posterior_mean = \
                Decimal(str(sample2_posterior_mean[0])).quantize(Decimal('0.01'))
            posterior_diff = "%.2f" % (sample1_posterior_mean -
                                       sample2_posterior_mean)
            bayes_factor = "%.2f" % (bf[0])
        else:
            posterior_diff = \
                ",".join(["%.2f" %(v) for v in (sample1_posterior_mean - sample2_posterior_mean)])
            sample1_posterior_mean = sample1_cred_intervals[1]
            sample2_posterior_mean = sample2_cred_intervals[1]
            bayes_factor = ",".join(["%.2f" % (max(v, 0)) for v in bf])

        # Write comparison output line
        output_fields = [
            event_name,
            # Mean and confidence bounds for sample 1
            "%s" % (sample1_posterior_mean),
            "%s" % (sample1_ci_low),
            "%s" % (sample1_ci_high),
            # Mean and confidence bounds for sample 2
            "%s" % (sample2_posterior_mean),
            "%s" % (sample2_ci_low),
            "%s" % (sample2_ci_high),
            # Delta Psi value
            "%s" % (posterior_diff),
            # Bayes factor
            "%s" % (bayes_factor),
            # Description of the isoforms
            "%s" % (isoforms_field),
            # Counts information for sample 1
            "%s" % (sample1_counts_info['counts']),
            "%s" % (sample1_counts_info['assigned_counts']),
            # Counts information for sample 2
            "%s" % (sample2_counts_info['counts']),
            "%s" % (sample2_counts_info['assigned_counts']),
            # Gene information
            gene_info["chrom"],
            gene_info["strand"],
            gene_info["mRNA_starts"],
            gene_info["mRNA_ends"]
        ]
        output_line = "%s\n" % ("\t".join(output_fields))
        output_file.write(output_line)
    print "Compared a total of %d events." % (num_events_compared)
    output_file.close()