Example 1
def runMISOlocal(pickledDir, bamFile, readlen, overhanglen, outdir,\
    paired_end, settings_f):
    """ Function to run MISO on a single bam file locally, i.e. do not copy to a node.

    Args:
        pickledDir (str/path): Directory pointing towards pickled MISO annotations. This database can be generated with the MISO -index flag
        bamFile (str/path): Directory containing sorted indexed bam file. *Please Note* Bam files must not be trimmed. MISO is not capable of processing mixed read lengths.
        readlen (int): Length of reads for bamFile
        overhanglen (int): The required number of nucleotides to overlap a splice junction to be considered in subsequent
        outDir (str/path): Directory where MISO results will be stored
        paired_end (bool): Paired-End mode. Currently MISO cannot handle paired-end data, this flag defaults to <False>
        settings_f (str/path): This file contains a list of flags to provide the cluster to allow for ease of job submission

    Returns:
        Nothing. Generates a directory <outDir> where pickled MISO events and PSI values are stored.

    """
    if paired_end == False or paired_end == 'False':
        paired_end = None
    Settings.load(settings_f)

    run_events_analysis.compute_all_genes_psi(
            pickledDir, bamFile, int(readlen), outdir,
            overhang_len=int(overhanglen),
            paired_end=paired_end, settings_fname=settings_f)
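A minimal invocation sketch for this wrapper. Every path below is a hypothetical placeholder; the call assumes Settings and run_events_analysis are importable from the MISO package, as in the function above.

# Hypothetical usage; all paths are placeholders.
runMISOlocal('/data/indexed_SE_events',   # pickled annotation index
             '/data/sample1.sorted.bam',  # sorted, indexed BAM file
             50,                          # read length
             8,                           # overhang length
             '/data/miso_out/sample1',    # output directory
             False,                       # single-end mode
             '/data/miso_settings.txt')   # MISO settings file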
Example 2
def runMISOlocal(pickledDir, bamFile, readlen, overhanglen, outdir,\
    paired_end, settings_f):

    if paired_end == False or paired_end == 'False':
        paired_end = None
    Settings.load(settings_f)

    #if not os.path.exists(outdir):
    #    print 'running', outdir
    run_events_analysis.compute_all_genes_psi(\
        pickledDir, bamFile, int(readlen), outdir,\
        overhang_len=int(overhanglen),\
        paired_end=paired_end, settings_fname=settings_f)
Example 3
def get_ids_passing_filter(gff_index_dir,
                           bam_filename,
                           output_dir):
    """
    Apply filter to events using bedtools and return
    only the events that meet the filter.
    """
    min_reads = 20
    settings = Settings.get()
    min_event_reads = Settings.get_min_event_reads()
    
    # Check that this was indexed with a version that outputs
    # genes.gff file
    genes_gff_fname = os.path.join(gff_index_dir,
                                   "genes.gff")
    if not os.path.isfile(genes_gff_fname):
        print "WARNING: Could not find \'genes.gff\' in %s - " \
              "skipping prefilter stage. Please reindex your " \
              "GFF file with the latest version to enable " \
              "prefiltering." %(gff_index_dir)
        return None
    print "Prefiltering reads..."
    coverage_fname = exon_utils.get_bam_gff_coverage(bam_filename,
                                                     genes_gff_fname,
                                                     output_dir)
    ids_passing_filter = []
    with open(coverage_fname) as coverage_in:
        for line in coverage_in:
            # Skip comments
            if line.startswith("#"):
                continue
            fields = line.strip().split("\t")
            # Get the counts field and the event ID
            # if it passes the filter
            counts = int(fields[9])
            if counts < min_event_reads:
                continue
            attribs = gff_utils.parse_gff_attribs(fields[8])
            if "ID" not in attribs:
                print "WARNING: No ID= found for line:\n%s\nSkipping..." \
                    %(line)
                continue
            event_id = attribs["ID"]
            ids_passing_filter.append(event_id)
    return ids_passing_filter
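A usage sketch, under the assumption that Settings.load(...) has already been called (the function reads the minimum event read count from Settings) and that bedtools is available on the PATH; the paths are placeholders.

# Hypothetical usage; returns None if the index lacks genes.gff.
event_ids = get_ids_passing_filter('/data/indexed_SE',
                                   '/data/sample1.sorted.bam',
                                   '/data/prefilter_out')
if event_ids is not None:
    print "%d events passed the coverage prefilter" %(len(event_ids))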
Example 4
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: Run in paired-end mode. Gives mean and standard deviation
      of fragment length distribution.
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # If given a template for the SAM file, use it
    template = None

    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    if "filter_reads" not in settings:
        filter_reads = True
    else:
        filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)

    for gene_id, gene_info in gff_genes.iteritems():
        lookup_id = gene_id
        # Skip genes that we were not asked to run on
        if lookup_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(map(lambda l: l < read_len, gene_obj.iso_lens)):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand,
                                      given_read_len=read_len)
        # Skip gene if none of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads,
                        min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type != None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            pass

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        miso_basename = miso_basename.replace(".pickle", "")
        output_filename = os.path.join(chrom_dir, "%s" %(miso_basename))
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
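Two hedged call sketches; the gene ID, index path, and BAM path are placeholders. In paired-end mode the function expects paired_end as a (mean, standard deviation) pair for the fragment length distribution, which it converts to a mean and variance internally.

# Hypothetical single-end call.
compute_gene_psi(['ENSMUSG00000005672'],
                 '/data/indexed/chr1/ENSMUSG00000005672.pickle',
                 '/data/sample1.sorted.bam',
                 '/data/miso_out',
                 read_len=50,
                 overhang_len=8)

# Hypothetical paired-end call; overhang is typically left at 1 here.
compute_gene_psi(['ENSMUSG00000005672'],
                 '/data/indexed/chr1/ENSMUSG00000005672.pickle',
                 '/data/sample1.sorted.bam',
                 '/data/miso_out',
                 read_len=50,
                 overhang_len=1,
                 paired_end=(250, 30))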
Example 5
def main():
    from optparse import OptionParser
    parser = OptionParser()

    ##
    ## Main options
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi",
                      nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene. "
                      "Expects four arguments: the first is a gene ID or set "
                      "of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene "
                      "information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, "
                      "and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode.  Takes a mean and standard "
                      "deviation for the fragment length distribution (assumed "
                      "to have discretized normal form.)")
    parser.add_option("--compute-genes-from-file", dest="compute_genes_from_file",
                      nargs=3, default=None,
                      help="Runs on a set of genes from a file. Takes as input: "
                      "(1) a two-column tab-delimited file, where column 1 is the "
                      "event ID (ID field from GFF) and the second column is "
                      "the path to the indexed GFF file for that event. "
                      "MISO will run on all the events described in the file, "
                      "(2) a sorted, indexed BAM file to run on, and (3) a "
                      "directory to output results to.")

    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
                      help="Compute comparison statistics between the two "
                      "given samples. Expects three directories: the first is "
                      "sample1's MISO output, the second is sample2's MISO "
                      "output, and the third is the directory where "
                      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
                      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-cluster", action="store_true",
                      dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to "
                      "chunk events file into. Only applies when "
                      "running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int",
                      default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int",
                      default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
                      help="Event type of two-isoform "
                      "events (e.g. 'SE', 'RI', 'A3SS', ...)")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    if options.compute_gene_psi is None:
        greeting()

    ##
    ## Load the settings file
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."

    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
    ##
    ## Main interface based on SAM files
    ##
    if options.compute_genes_from_file != None:
        # Run on events given by file
        run_compute_genes_from_file(options)
    if options.compute_gene_psi != None:
        run_compute_gene_psi(options)

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label != None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        assert(len(samples_label) >= 1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        summarize_sampler_results(samples_dir, summary_filename,
                                  use_compressed=use_compressed)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Example 6
def __init__(self, gff_dir, bam_filename,
             output_dir, read_len, overhang_len,
             main_logger,
             settings_fname=None,
             paired_end=None,
             use_cluster=False,
             chunk_jobs=200,
             SGEarray=False,
             sge_job_name="misojob",
             gene_ids=None,
             num_proc=None,
             wait_on_jobs=True):
    self.main_logger = main_logger
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename
    # Check that the BAM filename exists and that it has an index
    if not os.path.isfile(self.bam_filename):
        self.main_logger.error("BAM file %s not found." %(self.bam_filename))
        sys.exit(1)
    self.bam_index_fname = "%s.bai" %(self.bam_filename)
    if not os.path.isfile(self.bam_index_fname):
        self.main_logger.warning("Expected BAM index file %s not found." \
                                 %(self.bam_index_fname))
        self.main_logger.warning("Are you sure your BAM file is indexed?")
    self.output_dir = output_dir
    self.read_len = read_len
    # For now setting overhang to 1 always
    #self.overhang_len = overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    self.chunk_jobs = chunk_jobs
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    # if chunk_jobs not given (i.e. set to False),
    # then set it to arbitrary value
    if not self.chunk_jobs:
        self.chunk_jobs = 200
    self.SGEarray = SGEarray
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        num_proc = int(num_proc)
        self.num_processors = num_proc
        self.main_logger.info("Using %d processors" %(num_proc))
    self.long_thresh = 50
    self.batch_logs_dir = \
        os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = \
        os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = \
        os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = \
        os.path.join(output_dir, "scripts_output")
    misc_utils.make_dir(self.batch_logs_dir)
    misc_utils.make_dir(self.batch_genes_dir)
    misc_utils.make_dir(self.cluster_scripts_dir)
    misc_utils.make_dir(self.scripts_output_dir)
    # First compile a set of genes that should be run on
    # and output them to file along with their indexed
    # filenames
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    # If we're given filtered gene IDs, use them
    if gene_ids is not None:
        self.gene_ids = gene_ids
    else:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    if len(self.gene_ids) == 0:
        self.main_logger.error("No genes to run on. Did you pass me the wrong path " \
                               "to your index GFF directory? " \
                               "Or perhaps your indexed GFF directory " \
                               "is empty?")
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
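An instantiation sketch. The enclosing class is not shown in this snippet, so the name GenesDispatcher below is an assumption; the sketch also assumes Settings.load(...) was called beforehand, since __init__ reads cluster settings through Settings.get().

import logging
main_logger = logging.getLogger('miso')
# GenesDispatcher is an assumed name for the class this __init__ belongs to.
dispatcher = GenesDispatcher('/data/indexed_gff',         # indexed GFF directory
                             '/data/sample1.sorted.bam',  # sorted, indexed BAM
                             '/data/miso_out',            # output directory
                             50,                          # read length
                             1,                           # overhang (forced to 1 internally)
                             main_logger,
                             settings_fname='/data/miso_settings.txt',
                             num_proc=4)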
Example 7
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--run", dest="compute_genes_psi",
                      nargs=2, default=None,
                      help="Compute Psi values for a given GFF annotation "
                      "of either whole mRNA isoforms or isoforms produced by "
                      "single alternative splicing events. Expects two "
                      "arguments: an indexed GFF directory with genes to "
                      "process, and a sorted, indexed BAM file (with "
                      "headers) to run on.")
    parser.add_option("--event-type", dest="event_type", nargs=1,
                      help="[OPTIONAL] Type of event (e.g. SE, RI, A3SS, ...)",
                      default=None)
    parser.add_option("--use-cluster", dest="use_cluster",
                      action="store_true", default=False,
                      help="Run events on cluster.")
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to chunk "
                      "events file into. Only applies when running on cluster.")
    parser.add_option("--no-filter-events", dest="no_filter_events",
                      action="store_true", default=False,
                      help="Do not filter events for computing Psi. "
                      "By default, MISO computes Psi only for events that "
                      "have a sufficient number of junction reads. "
                      "The default filter varies by event type.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),                    
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", default=None, type="int",
                      help="Length of sequenced reads.")
    parser.add_option("--paired-end", dest="paired_end", nargs=2, default=None,
                      help="Run in paired-end mode. Takes mean and "
                      "standard deviation of insert length distribution.")
    parser.add_option("--overhang-len", dest="overhang_len",
                      default=None, type="int",
                      help="Length of overhang constraints "
                      "imposed on junctions.")
    parser.add_option("--output-dir", dest="output_dir", default=None,
                      help="Directory for MISO output.")
    parser.add_option("--job-name", dest="job_name", nargs=1,
                      help="Name for jobs submitted to queue for SGE jobs. " \
                      "Default is misojob", default="misojob")
    parser.add_option("--SGEarray", dest="SGEarray",
                      action="store_true", default=False,
                      help="Use MISO on cluster with Sun Grid Engine. "
                      "To be used in conjunction with --use-cluster option.")
    parser.add_option("--prefilter", dest="prefilter", default=False,
                      action="store_true",
                      help="Prefilter events based on coverage. If given as " 
                      "argument, run will begin by mapping BAM reads to event "
                      "regions (using bedtools), and omit events that do not "
                      "meet coverage criteria from the run. By default, turned "
                      "off. Note that events that do not meet the coverage criteria "
                      "will not be processed regardless, but --prefilter simply "
                      "does this filtering step at the start of the run, potentially "
                      "saving computation time so that low coverage events will not "
                      "be processed or distributed to jobs if MISO is run on a "
                      "cluster. This options requires bedtools to be installed and "
                      "available on path.")
    parser.add_option("-p", dest="num_proc", default=None, nargs=1,
                      help="Number of processors to use. Only applies when running " \
                      "MISO on a single machine with multiple cores; does not apply " \
                      "to runs submitted to cluster with --use-cluster.")
    parser.add_option("--version", dest="version", default=False,
                      action="store_true",
                      help="Print MISO version.")
    parser.add_option("--no-wait", dest="no_wait", default=False,
                      action="store_true",
                      help="If passed in, do not wait on cluster jobs after " \
                      "they are submitted. By default, wait.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    greeting()

    if options.version:
        print "MISO version %s\n" %(misopy.__version__)

    ##
    ## Load the settings file 
    ##
    if not os.path.isdir(miso_settings_path):
        print "Error: %s is not a directory containing a default MISO " \
              "settings filename. Please specify a settings filename " \
              "using --settings-filename."
        return
    
    settings_filename = \
        os.path.abspath(os.path.expanduser(options.settings_filename))
    Settings.load(settings_filename)
    
    if (not options.use_cluster) and options.chunk_jobs:
        print "Error: Chunking jobs only applies when using " \
              "the --use-cluster option to run MISO on cluster."
        sys.exit(1)
    if (not options.use_cluster) and options.SGEarray:
        print "Error: SGEarray implies that you are using an SGE cluster," \
              "please run again with --use-cluster option enabled."
        sys.exit(1)

    ##
    ## Quantitation using BAM for all genes
    ##
    if options.compute_genes_psi != None:
        # GFF filename with genes to process
        gff_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[0]))

        # BAM filename with reads
        bam_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[1]))

        if options.output_dir == None:
            print "Error: need --output-dir to compute Psi values."
            sys.exit(1)

        # Output directory to use
        output_dir = os.path.abspath(os.path.expanduser(options.output_dir))

        ##
        ## Load the main logging object
        ##
        logs_output_dir = os.path.join(output_dir, "logs")
        main_logger = get_main_logger(logs_output_dir)

        if options.read_len == None:
            main_logger.error("need --read-len to compute Psi values.")
            sys.exit(1)

        overhang_len = 1

        if options.paired_end != None and options.overhang_len != None:
            main_logger.warning("cannot use --overhang-len in paired-end mode.\n" \
                                "Using overhang = 1")
        if options.overhang_len != None:
            overhang_len = options.overhang_len

        # Whether to wait on cluster jobs or not
        wait_on_jobs = not options.no_wait
        compute_all_genes_psi(gff_filename, bam_filename,
                              options.read_len, output_dir,
                              main_logger,
                              overhang_len=overhang_len,
                              use_cluster=options.use_cluster,
                              SGEarray=options.SGEarray,
                              job_name=options.job_name,
                              chunk_jobs=options.chunk_jobs,
                              paired_end=options.paired_end,
                              settings_fname=settings_filename,
                              prefilter=options.prefilter,
                              num_proc=options.num_proc,
                              wait_on_jobs=wait_on_jobs)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="    
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Example 8
def compute_psi(sample_filenames,
                output_dir,
                event_type,
                read_len,
                overhang_len,
                use_cluster=False,
                chunk_jobs=False,
                filter_events=True,
                events_info_filename=None,
                settings_filename=None):
    """
    Compute Psi values for skipped exons.  Sample filenames is a mapping from
    sample label to sample.

      - sample_filenames = [[sample_label1, sample_filename1],
                            [sample_label2, sample_filename2]]
      - output_dir: output directory
      - event_type: 'SE', 'RI', etc.
    """
    misc_utils.make_dir(output_dir)

    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)

    misc_utils.make_dir(output_dir)

    print "Computing Psi for events of type %s" % (event_type)
    print "  - samples used: ", sample_filenames.keys()

    for sample_label, sample_filename in sample_filenames.iteritems():
        print "Processing sample: label=%s, filename=%s" \
                   %(sample_label, sample_filename)
        results_output_dir = os.path.join(output_dir, sample_label)
        misc_utils.make_dir(results_output_dir)

        # Load the set of counts and serialize them into JSON
        events = \
                   as_events.load_event_counts(sample_filename,
                                               event_type,
                                               events_info_filename=events_info_filename)

        # Filter events
        if filter_events:
            print "Filtering events..."
            events.filter_events(settings=Settings.get())

        print "Running on a total of %d events." % (len(events.events))

        events_filename = events.output_file(results_output_dir, sample_label)

        # Run MISO on them
        miso_cmd = "python %s --compute-two-iso-psi %s %s --event-type %s " \
                          "--read-len %d --overhang-len %d " \
                          %(os.path.join(miso_path, 'run_miso.py'),
                            events_filename,
                            results_output_dir,
                            event_type,
                            read_len,
                            overhang_len)
        if use_cluster:
            if chunk_jobs:
                miso_cmd += ' --use-cluster --chunk-jobs %d' % (chunk_jobs)
            else:
                miso_cmd += ' --use-cluster'
        print "Executing: %s" % (miso_cmd)
        if use_cluster:
            print " - Using cluster"
        os.system(miso_cmd)
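A usage sketch with placeholder labels and count filenames. Since the loop above iterates sample_filenames with .iteritems() and .keys(), it is passed as a dict from sample label to counts filename.

# Hypothetical two-sample run; all paths are placeholders.
sample_filenames = {'control':   '/data/control_counts.txt',
                    'knockdown': '/data/knockdown_counts.txt'}
compute_psi(sample_filenames, '/data/psi_out', 'SE',
            read_len=50, overhang_len=8,
            settings_filename='/data/miso_settings.txt')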
Example 9
def __init__(self, gff_dir, bam_filename,
             output_dir, read_len, overhang_len,
             settings_fname=None,
             paired_end=None,
             use_cluster=False,
             chunk_jobs=200,
             SGEarray=False,
             sge_job_name="misojob",
             gene_ids=None,
             num_proc=None,
             wait_on_jobs=True):
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename
    # Check that the BAM filename exists and that it has an index
    if not os.path.isfile(self.bam_filename):
        print "Error: BAM file %s not found." %(self.bam_filename)
        sys.exit(1)
    self.bam_index_fname = "%s.bai" %(self.bam_filename)
    if not os.path.isfile(self.bam_index_fname):
        print "WARNING: Expected BAM index file %s not found." \
            %(self.bam_index_fname)
        print "Are you sure your BAM file is indexed?"
    self.output_dir = output_dir
    self.read_len = read_len
    # For now setting overhang to 1 always
    #self.overhang_len = overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    self.chunk_jobs = chunk_jobs
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    # if chunk_jobs not given (i.e. set to False),
    # then set it to arbitrary value
    if not self.chunk_jobs:
        self.chunk_jobs = 200
    self.SGEarray = SGEarray
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        num_proc = int(num_proc)
        self.num_processors = num_proc
        print "Using %d processors" %(num_proc)
    self.long_thresh = 50
    self.batch_logs_dir = \
        os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = \
        os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = \
        os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = \
        os.path.join(output_dir, "scripts_output")
    misc_utils.make_dir(self.batch_logs_dir)
    misc_utils.make_dir(self.batch_genes_dir)
    misc_utils.make_dir(self.cluster_scripts_dir)
    misc_utils.make_dir(self.scripts_output_dir)
    # First compile a set of genes that should be run on
    # and output them to file along with their indexed
    # filenames
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    # If we're given filtered gene IDs, use them
    if gene_ids is not None:
        self.gene_ids = gene_ids
    else:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    if len(self.gene_ids) == 0:
        print "Error: No genes to run on. Did you pass me the wrong path " \
              "to your index GFF directory? " \
              "Or perhaps your indexed GFF directory " \
              "is empty?"
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
Example 10
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--run", dest="compute_genes_psi",
                      nargs=2, default=None,
                      help="Compute Psi values for a given GFF annotation "
                      "of either whole mRNA isoforms or isoforms produced by "
                      "single alternative splicing events. Expects two "
                      "arguments: an indexed GFF directory with genes to "
                      "process, and a sorted, indexed BAM file (with "
                      "headers) to run on.")
    parser.add_option("--event-type", dest="event_type", nargs=1,
                      help="[OPTIONAL] Type of event (e.g. SE, RI, A3SS, ...)",
                      default=None)
    parser.add_option("--use-cluster", dest="use_cluster",
                      action="store_true", default=False,
                      help="Run events on cluster.")
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to chunk "
                      "events file into. Only applies when running on cluster.")
    parser.add_option("--no-filter-events", dest="no_filter_events",
                      action="store_true", default=False,
                      help="Do not filter events for computing Psi. "
                      "By default, MISO computes Psi only for events that "
                      "have a sufficient number of junction reads. "
                      "The default filter varies by event type.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),                    
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", default=None, type="int",
                      help="Length of sequenced reads.")
    parser.add_option("--paired-end", dest="paired_end", nargs=2, default=None,
                      help="Run in paired-end mode. Takes mean and "
                      "standard deviation of insert length distribution.")
    parser.add_option("--overhang-len", dest="overhang_len",
                      default=None, type="int",
                      help="Length of overhang constraints "
                      "imposed on junctions.")
    parser.add_option("--output-dir", dest="output_dir", default=None,
                      help="Directory for MISO output.")
    parser.add_option("--job-name", dest="job_name", nargs=1,
                      help="Name for jobs submitted to queue for SGE jobs. " \
                      "Default is misojob", default="misojob")
    parser.add_option("--SGEarray", dest="SGEarray",
                      action="store_true", default=False,
                      help="Use MISO on cluster with Sun Grid Engine. "
                      "To be used in conjunction with --use-cluster option.")
    parser.add_option("--prefilter", dest="prefilter", default=False,
                      action="store_true",
                      help="Prefilter events based on coverage. If given as " 
                      "argument, run will begin by mapping BAM reads to event "
                      "regions (using bedtools), and omit events that do not "
                      "meet coverage criteria from the run. By default, turned "
                      "off. Note that events that do not meet the coverage criteria "
                      "will not be processed regardless, but --prefilter simply "
                      "does this filtering step at the start of the run, potentially "
                      "saving computation time so that low coverage events will not "
                      "be processed or distributed to jobs if MISO is run on a "
                      "cluster. This options requires bedtools to be installed and "
                      "available on path.")
    parser.add_option("-p", dest="num_proc", default=None, nargs=1,
                      help="Number of processors to use. Only applies when running " \
                      "MISO on a single machine with multiple cores; does not apply " \
                      "to runs submitted to cluster with --use-cluster.")
    parser.add_option("--version", dest="version", default=False,
                      action="store_true",
                      help="Print MISO version.")
    parser.add_option("--no-wait", dest="no_wait", default=False,
                      action="store_true",
                      help="If passed in, do not wait on cluster jobs after " \
                      "they are submitted. By default, wait.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    greeting()

    if options.version:
        print "MISO version %s\n" %(misopy.__version__)

    ##
    ## Load the settings file 
    ##
    if not os.path.isdir(miso_settings_path):
        print "Error: %s is not a directory containing a default MISO " \
              "settings filename. Please specify a settings filename " \
              "using --settings-filename."
        return
    
    settings_filename = \
        os.path.abspath(os.path.expanduser(options.settings_filename))
    Settings.load(settings_filename)
    
    if (not options.use_cluster) and options.chunk_jobs:
        print "Error: Chunking jobs only applies when using " \
              "the --use-cluster option to run MISO on cluster."
        sys.exit(1)
    if (not options.use_cluster) and options.SGEarray:
        print "Error: SGEarray implies that you are using an SGE cluster," \
              "please run again with --use-cluster option enabled."
        sys.exit(1)

    ##
    ## Quantitation using BAM for all genes
    ##
    if options.compute_genes_psi != None:
        # GFF filename with genes to process
        gff_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[0]))

        # BAM filename with reads
        bam_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[1]))

        if options.output_dir == None:
            print "Error: need --output-dir to compute Psi values."
            sys.exit(1)

        # Output directory to use
        output_dir = os.path.abspath(os.path.expanduser(options.output_dir))

        if options.read_len == None:
            print "Error: need --read-len to compute Psi values."
            sys.exit(1)

        overhang_len = 1

        if options.paired_end != None and options.overhang_len != None:
            print "WARNING: cannot use --overhang-len in paired-end mode."
            print "Using overhang = 1"

        if options.overhang_len != None:
            overhang_len = options.overhang_len

        # Whether to wait on cluster jobs or not
        wait_on_jobs = not options.no_wait
        compute_all_genes_psi(gff_filename, bam_filename,
                              options.read_len, output_dir,
                              overhang_len=overhang_len,
                              use_cluster=options.use_cluster,
                              SGEarray=options.SGEarray,
                              job_name=options.job_name,
                              chunk_jobs=options.chunk_jobs,
                              paired_end=options.paired_end,
                              settings_fname=settings_filename,
                              prefilter=options.prefilter,
                              num_proc=options.num_proc,
                              wait_on_jobs=wait_on_jobs)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="    
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Example 11
def runMISOsingle(pickledDir, bamFile, readlen, overhanglen, outdir,\
    paired_end, settings_f, scratchDir):
    """ Function to run MISO on a single bam file.

    Args:
        pickledDir (str/path): Directory pointing towards pickled MISO annotations. This database can be generated with the MISO -index flag
        bamFile (str/path): Directory containing sorted indexed bam file. *Please Note* Bam files must not be trimmed. MISO is not capable of processing mixed read lengths.
        readlen (int): Length of reads for bamFile
        overhanglen (int): The required number of nucleotides to overlap a splice junction to be considered in subsequent
        outDir (str/path): Directory where MISO results will be stored
        paired_end (bool): Paired-End mode. Currently MISO cannot handle paired-end data, this flag defaults to <False>
        settings_f (str/path): This file contains a list of flags to provide the cluster to allow for ease of job submission
        scratchDir (str/path): Directory where MISO output will be stored.

    Returns:
        Nothing. Generates a directory <outDir> where pickled MISO events and PSI values are stored.

    """
    if paired_end == 'False':
        paired_end = None

    t = str(time.time()) + str(random.random())

    print os.path.basename(pickledDir)
    if not os.path.exists(scratchDir):
        cmd = 'mkdir ' + scratchDir
        process = subprocess.Popen(cmd, shell=True)
        process.wait()
 
    # Copy pickled dir.
    pickled = os.path.join(scratchDir, os.path.basename(pickledDir) + \
        "." + t)
    cmd = 'mkdir ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'cp -r ' + pickledDir + '/* ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()

    # Copy bam file. 
    cmd = 'cp -fL ' + bamFile + ' ' + scratchDir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'cp -fL ' + bamFile + '.bai ' + scratchDir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    bam = os.path.join(scratchDir, os.path.basename(bamFile))

    # Give output directory in scratch a timestamp
    out = os.path.join(scratchDir, os.path.basename(outdir + "." + t))

    # LOAD SETTINGS FOR MISO
    Settings.load(settings_f)
 
    run_events_analysis.compute_all_genes_psi(\
        pickled, bam, int(readlen), out, overhang_len=int(overhanglen),\
        paired_end=paired_end, settings_fname=settings_f, prefilter=False)

    # Summarize sample
    #summary_fname = os.path.join(out, os.path.basename(outdir) + '.miso_summary') 
    #samples_utils.summarize_sampler_results(out, summary_fname)

    if not os.path.exists(outdir):
        cmd = 'mkdir -p ' + outdir
        process = subprocess.Popen(cmd, shell=True)
        process.wait()
        
    # Copy output back.
    cmd = 'cp -r ' + out + '/* ' + outdir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
 
    # Remove bam, output, and pickled dir. 
    cmd = 'rm ' + bam
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm ' + bam + '.bai'
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm -fr ' + out
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm -fr ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
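A usage sketch with placeholder paths. scratchDir should point at fast, node-local storage, since the function copies the annotation index and BAM file there, runs MISO, and copies the results back to outdir before cleaning up.

# Hypothetical usage; note paired_end is compared against the string 'False' above.
runMISOsingle('/data/indexed_SE_events',
              '/data/sample1.sorted.bam',
              50,                         # read length
              8,                          # overhang length
              '/data/miso_out/sample1',   # final output directory
              'False',                    # single-end mode
              '/data/miso_settings.txt',
              '/scratch/miso_tmp')        # node-local scratch space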
Example 12
    def run(self, delay_constant=0.9):
        """
        Run batches either locally on multi-cores
        or using cluster.
        """
        batch_filenames = self.output_batch_files()
        # All MISO commands, each correspond to a batch,
        # and the number of jobs in each batch
        all_miso_cmds = []
        num_batches = len(batch_filenames)
        ##
        ## Prepare all the files necessary to run each batch
        ##
        print "Preparing to run %d batches of jobs..." % (num_batches)
        miso_run = os.path.join(miso_path, "run_miso.py")
        for batch_num, batch in enumerate(batch_filenames):
            batch_filename, batch_size = batch
            miso_cmd = \
              "python %s --compute-genes-from-file \"%s\" %s %s --read-len %d " \
                    %(miso_run,
                      batch_filename,
                      self.bam_filename,
                      self.output_dir,
                      self.read_len)
            # Add paired-end parameters and read len/overhang len
            if self.paired_end != None:
                # Run in paired-end mode
                frag_mean = float(self.paired_end[0])
                frag_sd = float(self.paired_end[1])
                miso_cmd += " --paired-end %.1f %.1f" % (frag_mean, frag_sd)
            else:
                # Overhang len only used in single-end mode
                miso_cmd += " --overhang-len %d" % (self.overhang_len)
            # Add settings filename if given
            if self.settings_fname != None:
                miso_cmd += " --settings-filename %s" \
                    %(self.settings_fname)
            all_miso_cmds.append((miso_cmd, batch_size))
        ##
        ## Run all MISO commands for the batches
        ## either locally using multi-cores or on cluster
        ##
        # First handle special case of SGE cluster submission
        if self.use_cluster and self.SGEarray:
            print "Using SGEarray..."
            # Call SGE
            batch_argfile = os.path.join(self.cluster_scripts_dir,
                                         "run_args.txt")
            cluster_utils.run_SGEarray_cluster(all_miso_cmds,
                                               batch_argfile,
                                               self.output_dir,
                                               settings=self.settings_fname,
                                               job_name=self.sge_job_name,
                                               chunk=self.chunk_jobs)
            # End SGE case
            return

        # All cluster jobs
        cluster_jobs = []
        for batch_num, cmd_info in enumerate(all_miso_cmds):
            miso_cmd, batch_size = cmd_info
            print "Running batch of %d genes.." % (batch_size)
            print "  - Executing: %s" % (miso_cmd)
            # Make a log file for the batch, where all the output
            # will be redirected
            time_str = time.strftime("%m-%d-%y_%H:%M:%S")
            batch_logfile = os.path.join(
                self.batch_logs_dir, "batch-%d-%s.log" % (batch_num, time_str))
            cmd_to_run = "%s >> \"%s\";" % (miso_cmd, batch_logfile)
            if not self.use_cluster:
                # Run locally
                p = subprocess.Popen(cmd_to_run, shell=True)
                thread_id = "batch-%d" % (batch_num)
                print "  - Submitted thread %s" % (thread_id)
                self.threads[thread_id] = p
            else:
                # Setup cluster engine
                Settings.load(self.settings_fname)
                clustercmd = Settings.get_cluster_command()

                self.cluster_engine = getClusterEngine(clustercmd,
                                                       self.settings_fname)

                # Run on cluster
                if batch_size >= self.long_thresh:
                    queue_type = "long"
                else:
                    queue_type = "short"
                # Run on cluster
                job_name = "gene_psi_batch_%d" % (batch_num)
                print "Submitting to cluster: %s" % (cmd_to_run)
                job_id = \
                    self.cluster_engine.run_on_cluster(cmd_to_run,
                                                 job_name,
                                                 self.output_dir,
                                                 queue_type=queue_type)
                if job_id is not None:
                    cluster_jobs.append(job_id)
                time.sleep(delay_constant)
            # Extra delay between batch submissions
            time.sleep(delay_constant)
        # If we ran jobs on the cluster and were asked to,
        # wait for them to finish.
        if self.wait_on_jobs:
            if self.use_cluster:
                if len(cluster_jobs) == 0:
                    # We were asked to use the cluster but the list
                    # of cluster jobs is empty, meaning we could not
                    # parse the job IDs from the submission system.
                    # Report this to the user.
                    self.main_logger.warning("Asked to wait on cluster jobs but cannot " \
                                             "parse their job IDs from the cluster submission " \
                                             "system.")
                # Wait on the cluster jobs; if 'cluster_jobs' is empty
                # here, this will not wait
                self.cluster_engine.wait_on_jobs(cluster_jobs, self.cluster_cmd)
        else:
            if self.use_cluster:
                # If we're running in cluster mode and were asked not
                # to wait for jobs, let the user know
                self.main_logger.info("Not waiting on cluster jobs.")
        # If we ran jobs locally, wait for them to finish
        # (this does nothing if jobs were submitted to the cluster)
        self.wait_on_threads()
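
The local branch above stores each launched batch as a subprocess.Popen handle in self.threads and then defers to self.wait_on_threads(), which is not part of this excerpt. A minimal sketch of what such a method might look like, assuming only that self.threads maps thread IDs to Popen handles exactly as populated above:

    def wait_on_threads(self):
        # Hypothetical sketch: block until every locally launched
        # MISO batch finishes. Does nothing in cluster mode, where
        # self.threads stays empty.
        for thread_id, proc in self.threads.items():
            print "Waiting on thread %s" % (thread_id)
            retcode = proc.wait()
            if retcode != 0:
                print "WARNING: thread %s exited with code %d" \
                      % (thread_id, retcode)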
Example n. 14
def compute_psi(sample_filenames, output_dir, event_type,
                read_len, overhang_len,
                use_cluster=False,
                chunk_jobs=False,
                filter_events=True,
                events_info_filename=None,
                settings_filename=None):
    """
    Compute Psi values for two-isoform events (e.g. skipped exons).
    sample_filenames is a mapping (dict) from sample label to sample
    filename:

      - sample_filenames = {sample_label1: sample_filename1,
                            sample_label2: sample_filename2}
      - output_dir: output directory
      - event_type: 'SE', 'RI', etc.
    """
    misc_utils.make_dir(output_dir)

    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)

    misc_utils.make_dir(output_dir)

    print "Computing Psi for events of type %s" %(event_type)
    print "  - samples used: ", sample_filenames.keys()

    for sample_label, sample_filename in sample_filenames.iteritems():
        print "Processing sample: label=%s, filename=%s" \
            %(sample_label, sample_filename)
        results_output_dir = os.path.join(output_dir, sample_label)
        misc_utils.make_dir(results_output_dir)

        # Load the set of counts and serialize them into JSON
        events = \
            as_events.load_event_counts(sample_filename,
                                        event_type,
                                        events_info_filename=events_info_filename)

        # Filter events
        if filter_events:
            print "Filtering events..."
            events.filter_events(settings=Settings.get())

        print "Running on a total of %d events." %(len(events.events))

        events_filename = events.output_file(results_output_dir,
                                             sample_label)

        # Run MISO on them
        miso_cmd = "python %s --compute-two-iso-psi %s %s --event-type %s " \
                   "--read-len %d --overhang-len %d " \
                   %(os.path.join(miso_path, 'run_miso.py'),
                     events_filename,
                     results_output_dir,
                     event_type,
                     read_len,
                     overhang_len)
        if use_cluster:
            if chunk_jobs:
                miso_cmd += ' --use-cluster --chunk-jobs %d' %(chunk_jobs)
            else:
                miso_cmd += ' --use-cluster'
        print "Executing: %s" %(miso_cmd)
        if use_cluster:
            print " - Using cluster"
        os.system(miso_cmd)
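
For reference, a hypothetical call to compute_psi; the sample labels, file paths, and read parameters below are illustrative placeholders rather than values from the original source:

sample_filenames = {"heart": "/data/heart_counts.txt",
                    "brain": "/data/brain_counts.txt"}
compute_psi(sample_filenames, "/data/miso_out", "SE",
            read_len=36, overhang_len=4,
            settings_filename="miso_settings.txt")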
Example n. 15
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: Run in paired-end mode. Gives mean and standard deviation
      of fragment length distribution.
    """
    misc_utils.make_dir(output_dir)
        
    if not os.path.exists(gff_index_filename):
        print "Error: GFF index %s not found." %(gff_index_filename)
        return
    
    num_genes = len(gene_ids)
    
    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        # Fragment length variance is the square of the given
        # standard deviation
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
    
    # If given a template for the SAM file, use it
    template = None

    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    if "filter_reads" not in settings:
        filter_reads = True
    else:
        filter_reads = settings["filter_reads"]
        
    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)
    
    for gene_id, gene_info in gff_genes.iteritems():
        lookup_id = gene_id
        # Skip genes that we were not asked to run on
        if lookup_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the gene
        if all(iso_len < read_len for iso_len in gene_obj.iso_lens):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue
        
        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand)
        # Skip gene if none of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads,
                        min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            pass

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        miso_basename = miso_basename.replace(".pickle", "")
        output_filename = os.path.join(chrom_dir, "%s" %(miso_basename))
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
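
A hypothetical invocation of compute_gene_psi, for reference; the gene ID and file paths are illustrative placeholders. Note that paired_end mirrors the format the function expects: a (mean, standard deviation) pair for the fragment length distribution, or None for single-end data:

compute_gene_psi(["ENSG00000100320"],
                 "indexed/chr22/ENSG00000100320.pickle",
                 "sample.sorted.bam",
                 "miso_out",
                 36, 4,
                 paired_end=("250", "15"))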
Example n. 16
def main():
    from optparse import OptionParser
    parser = OptionParser()

    ##
    ## Main options
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi",
                      nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene. "
                      "Expects four arguments: the first is a gene ID or set "
                      "of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene "
                      "information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, "
                      "and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode.  Takes a mean and standard "
                      "deviation for the fragment length distribution (assumed "
                      "to have discretized normal form.)")
    parser.add_option("--compute-genes-from-file", dest="compute_genes_from_file",
                      nargs=3, default=None,
                      help="Runs on a set of genes from a file. Takes as input: "
                      "(1) a two-column tab-delimited file, where column 1 is the "
                      "event ID (ID field from GFF) and the second column is "
                      "the path to the indexed GFF file for that event. "
                      "MISO will run on all the events described in the file, "
                      "(2) a sorted, indexed BAM file to run on, and (3) a "
                      "directory to output results to.")
    
    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
		      help="Compute comparison statistics between the two "
                      "given samples. Expects three directories: the first is "
                      "sample1's MISO output, the second is sample2's MISO "
                      "output, and the third is the directory where "
		      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
		      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-cluster", action="store_true",
                      dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
		      help="Size (in number of events) of each job to "
                      "chunk events file into. Only applies when "
                      "running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int",
                      default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int",
                      default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
		      help="Event type of two-isoform "
                      "events (e.g. 'SE', 'RI', 'A3SS', ...)")    
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    if options.compute_gene_psi is None:
        greeting()

    ##
    ## Load the settings file 
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."
            
    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
    ##
    ## Main interface based on SAM files
    ##
    if options.compute_genes_from_file is not None:
        # Run on events given by file
        run_compute_genes_from_file(options)
    if options.compute_gene_psi is not None:
        run_compute_gene_psi(options)
        
    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label is not None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        assert(len(samples_label) >= 1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        summarize_sampler_results(samples_dir, summary_filename,
                                  use_compressed=use_compressed)

    if options.view_gene is not None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes is None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="    
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Example n. 17
def runMISOsingle(pickledDir, bamFile, readlen, overhanglen, outdir,\
    paired_end, settings_f, scratchDir):

    # Accept both the boolean False and the string 'False'
    # for single-end mode
    if paired_end == False or paired_end == 'False':
        paired_end = None

    # Unique suffix so concurrent jobs do not collide in scratch space
    t = str(time.time()) + str(random.random())

    print os.path.basename(pickledDir)
    if not os.path.exists(scratchDir):
        # Note: these shell invocations assume paths without spaces
        cmd = 'mkdir ' + scratchDir
        process = subprocess.Popen(cmd, shell=True)
        process.wait()
 
    # Copy pickled dir.
    pickled = os.path.join(scratchDir, os.path.basename(pickledDir) + \
        "." + t)
    cmd = 'mkdir ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'cp -r ' + pickledDir + '/* ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()

    # Copy bam file. 
    cmd = 'cp -fL ' + bamFile + ' ' + scratchDir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'cp -fL ' + bamFile + '.bai ' + scratchDir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    bam = os.path.join(scratchDir, os.path.basename(bamFile))

    # Give output directory in scratch a timestamp
    out = os.path.join(scratchDir, os.path.basename(outdir + "." + t))

    # LOAD SETTINGS FOR MISO
    Settings.load(settings_f)
 
    run_events_analysis.compute_all_genes_psi(\
        pickled, bam, int(readlen), out, overhang_len=int(overhanglen),\
        paired_end=paired_end, settings_fname=settings_f, prefilter=True)

    # Summarize sample
    #summary_fname = os.path.join(out, os.path.basename(outdir) + '.miso_summary') 
    #samples_utils.summarize_sampler_results(out, summary_fname)

    if not os.path.exists(outdir):
        cmd = 'mkdir -p ' + outdir
        process = subprocess.Popen(cmd, shell=True)
        process.wait()
        
    # Copy output back.
    cmd = 'cp -r ' + out + '/* ' + outdir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
 
    # Remove bam, output, and pickled dir. 
    cmd = 'rm ' + bam
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm ' + bam + '.bai'
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm -fr ' + out
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm -fr ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
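
Finally, a hypothetical call to runMISOsingle; every argument below is an illustrative placeholder for a node-local scratch workflow:

runMISOsingle("indexed_events/SE",        # pickled MISO annotation dir
              "/data/sample.sorted.bam",  # sorted, indexed BAM (with .bai)
              36, 4,                      # read length, overhang length
              "/results/sample_miso",     # final output directory
              "False",                    # single-end mode
              "miso_settings.txt",        # MISO settings file
              "/scratch/miso_tmp")        # node-local scratch directory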