Exemple #1
0
def parseGene(pickle_filename, event):
    """
    Parse a pickled gene.

    Loads the indexed GFF file 'pickle_filename' and collects the
    transcript/exon layout of the gene whose ID equals 'event'.

    Returns a tuple:

      (tx_start, tx_end, exon_starts, exon_ends, gene_obj,
       mRNAs, strand, chrom, mRNA_ids)

    - tx_start, tx_end: bounds from gff_utils.get_inclusive_txn_bounds
    - exon_starts, exon_ends: start/end of every exon record visited
    - gene_obj: the 'gene_object' entry stored for the gene
    - mRNAs: one list per mRNA of sorted [start, end] exon pairs,
      ordered by number of exons (fewest first)
    - strand: strand of the last exon record visited, or None if the
      gene has no exons
    - chrom: gene_obj.chrom
    - mRNA_ids: mRNA IDs, in the same order as 'mRNAs'

    Raises Exception if the file is missing, cannot be loaded, or
    does not contain 'event'.
    """
    if not os.path.isfile(pickle_filename):
        raise Exception("Error: no filename %s" % (pickle_filename))
    gff_genes = gff_utils.load_indexed_gff_file(pickle_filename)

    if gff_genes is None:
        raise Exception("Error: could not load genes from %s"
                        % (pickle_filename))

    # Locate the requested gene. Without the explicit 'else' a missing
    # event would only surface later as an unbound-variable error.
    for gene_id, gene_info in gff_genes.iteritems():
        if gene_id == event:
            break
    else:
        raise Exception("Error: gene %s not found in %s"
                        % (event, pickle_filename))

    gene_obj = gene_info['gene_object']
    gene_entry = gene_info['hierarchy'][gene_id]
    tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(gene_entry)
    chrom = gene_obj.chrom

    exon_starts = []
    exon_ends = []
    # Stays None when the gene has no exon records at all
    strand = None
    # (mRNA_id, exon list) pairs, kept together so the IDs can follow
    # the exon lists through the sort below
    transcripts = []
    for mRNA_id, mRNA_info in gene_entry['mRNAs'].iteritems():
        mRNA = []
        for _exon_id, exon_info in mRNA_info['exons'].iteritems():
            exon_rec = exon_info['record']
            strand = exon_rec.strand
            exon_starts.append(exon_rec.start)
            exon_ends.append(exon_rec.end)
            mRNA.append(sorted([exon_rec.start, exon_rec.end]))
        transcripts.append((mRNA_id, mRNA))

    # Order transcripts by exon count. The sort is stable, so 'mRNAs'
    # comes out exactly as a plain mRNAs.sort(key=len) would give,
    # while 'mRNA_ids' now stays aligned with it.
    transcripts.sort(key=lambda pair: len(pair[1]))
    mRNA_ids = [pair[0] for pair in transcripts]
    mRNAs = [pair[1] for pair in transcripts]

    return tx_start, tx_end, exon_starts, exon_ends, gene_obj, \
           mRNAs, strand, chrom, mRNA_ids
def parseGene(pickle_filename, event):
    """
    Parse a pickled gene.

    Loads the indexed GFF file 'pickle_filename' and collects the
    transcript/exon layout of the gene whose ID equals 'event'.

    Returns a tuple:

      (tx_start, tx_end, exon_starts, exon_ends, gene_obj,
       mRNAs, strand, chrom)

    - tx_start, tx_end: bounds from gff_utils.get_inclusive_txn_bounds
    - exon_starts, exon_ends: start/end of every exon record visited
    - gene_obj: the 'gene_object' entry stored for the gene
    - mRNAs: one list per mRNA of sorted [start, end] exon pairs,
      ordered by number of exons (fewest first)
    - strand: strand of the last exon record visited, or None if the
      gene has no exons
    - chrom: gene_obj.chrom

    Raises Exception if the file is missing, cannot be loaded, or
    does not contain 'event'.
    """
    if not os.path.isfile(pickle_filename):
        raise Exception("Error: no filename %s" % (pickle_filename))
    gff_genes = gff_utils.load_indexed_gff_file(pickle_filename)

    if gff_genes is None:
        raise Exception("Error: could not load genes from %s"
                        % (pickle_filename))

    # Locate the requested gene. Without the explicit 'else' a missing
    # event would only surface later as an unbound-variable error.
    for gene_id, gene_info in gff_genes.iteritems():
        if gene_id == event:
            break
    else:
        raise Exception("Error: gene %s not found in %s"
                        % (event, pickle_filename))

    gene_obj = gene_info['gene_object']
    gene_entry = gene_info['hierarchy'][gene_id]
    tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(gene_entry)
    chrom = gene_obj.chrom

    exon_starts = []
    exon_ends = []
    mRNAs = []
    # Stays None when the gene has no exon records at all
    strand = None
    for _mRNA_id, mRNA_info in gene_entry['mRNAs'].iteritems():
        mRNA = []
        for _exon_id, exon_info in mRNA_info['exons'].iteritems():
            exon_rec = exon_info['record']
            strand = exon_rec.strand
            exon_starts.append(exon_rec.start)
            exon_ends.append(exon_rec.end)
            mRNA.append(sorted([exon_rec.start, exon_rec.end]))
        mRNAs.append(mRNA)

    # Transcripts with the fewest exons come first
    mRNAs.sort(key=len)
    return tx_start, tx_end, exon_starts, exon_ends, gene_obj, \
           mRNAs, strand, chrom
Exemple #3
0
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: Set of gene IDs corresponding to gene IDs from the GFF
    - gff_index_filename: Indexed GFF filename describing the genes
      (must end in ".pickle")
    - bam_filename: BAM filename with the reads (must be sorted and
      indexed)
    - output_dir: Output directory
    - read_len: Read length; genes whose isoforms are all shorter than
      this are skipped
    - overhang_len: Overhang length handed to the sampler parameters
    - paired_end: Optional: Run in paired-end mode. Gives mean and
      standard deviation of fragment length distribution.
    - event_type: Optional: if given, results are written under
      output_dir/event_type/<chrom> instead of output_dir/<chrom>
    - verbose: Currently unused.

    Returns None. Prints an error and returns early if the index file
    does not exist; exits the process with status 1 if the index
    filename does not end in ".pickle".
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        # paired_end is (mean, standard deviation); the sampler wants
        # the variance
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # If given a template for the SAM file, use it
    template = None
    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    # Read filtering is on unless the settings explicitly say otherwise.
    # Guarded the same way as 'sam_template' so that missing settings
    # fall back to the default instead of raising.
    filter_reads = True
    if settings and "filter_reads" in settings:
        filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    # NOTE(review): the result is not used below; the call is kept in
    # case the helper validates the index -- confirm and remove.
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)

    pickle_suffix = ".pickle"

    for gene_id, gene_info in gff_genes.iteritems():
        # Skip genes that we were not asked to run on
        if gene_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(iso_len < read_len for iso_len in gene_obj.iso_lens):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand,
                                      given_read_len=read_len)
        # Skip gene if too few reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads,
                        min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            # An already-existing directory is fine; anything else
            # (permissions, a file in the way, ...) is a real failure
            # that would otherwise only show up when writing output.
            if not os.path.isdir(chrom_dir):
                raise

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(pickle_suffix):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        # Strip only the trailing extension; str.replace would also
        # remove any ".pickle" occurring earlier in the name.
        miso_basename = miso_basename[:-len(pickle_suffix)]
        output_filename = os.path.join(chrom_dir, "%s" %(miso_basename))
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
Exemple #4
0
def main():
    """
    Command-line entry point.

    Parses the options defined below, loads the MISO settings file and
    then runs every action that was requested, in this order: sample
    comparison (--compare-samples), Psi computation
    (--compute-genes-from-file, --compute-gene-psi), sample summary
    (--summarize-samples) and gene viewing (--view-gene).

    Exits with status 1 when --use-compressed points at a missing file,
    when no usable summary label can be determined, or when the indexed
    gene file given to --view-gene cannot be loaded.
    """
    from optparse import OptionParser
    parser = OptionParser()

    ##
    ## Main options
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi",
                      nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene. "
                      "Expects four arguments: the first is a gene ID or set "
                      "of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene "
                      "information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, "
                      "and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode.  Takes a mean and standard "
                      "deviation for the fragment length distribution (assumed "
                      "to have discretized normal form.)")
    parser.add_option("--compute-genes-from-file", dest="compute_genes_from_file",
                      nargs=3, default=None,
                      help="Runs on a set of genes from a file. Takes as input: "
                      "(1) a two-column tab-delimited file, where column 1 is the "
                      "event ID (ID field from GFF) and the second column is "
                      "the path to the indexed GFF file for that event. "
                      "MISO will run on all the events described in the file, "
                      "(2) a sorted, indexed BAM file to run on, and (3) a "
                      "directory to output results to.")

    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
                      help="Compute comparison statistics between the two "
                      "given samples. Expects three directories: the first is "
                      "sample1's MISO output, the second is sample2's MISO "
                      "output, and the third is the directory where "
                      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
                      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-cluster", action="store_true",
                      dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to "
                      "chunk events file into. Only applies when "
                      "running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int",
                      default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int",
                      default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
                      help="Event type of two-isoform "
                      "events (e.g. 'SE', 'RI', 'A3SS', ...)")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    # The greeting is skipped when computing Psi for a single gene
    if options.compute_gene_psi is None:
        greeting()

    ##
    ## Load the settings file
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."

    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
    ##
    ## Main interface based on SAM files
    ##
    if options.compute_genes_from_file is not None:
        # Run on events given by file
        run_compute_genes_from_file(options)
    if options.compute_gene_psi is not None:
        run_compute_gene_psi(options)

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label is not None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        # Explicit check rather than 'assert': asserts are stripped when
        # Python runs with -O, which would let an empty label through
        # and produce a file named just ".miso_summary".
        if len(samples_label) < 1:
            print "Error: could not determine a label for the summary " \
                  "file; please pass a non-empty --summary-label."
            sys.exit(1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        summarize_sampler_results(samples_dir, summary_filename,
                                  use_compressed=use_compressed)

    if options.view_gene is not None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes is None:
            print "No genes."
            sys.exit(1)

        # Dump every gene in the index: object, isoforms, mRNA IDs, exons
        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Exemple #5
0
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--run", dest="compute_genes_psi",
                      nargs=2, default=None,
                      help="Compute Psi values for a given GFF annotation "
                      "of either whole mRNA isoforms or isoforms produced by "
                      "single alternative splicing events. Expects two "
                      "arguments: an indexed GFF directory with genes to "
                      "process, and a sorted, indexed BAM file (with "
                      "headers) to run on.")
    parser.add_option("--event-type", dest="event_type", nargs=1,
                      help="[OPTIONAL] Type of event (e.g. SE, RI, A3SS, ...)",
                      default=None)
    parser.add_option("--use-cluster", dest="use_cluster",
                      action="store_true", default=False,
                      help="Run events on cluster.")
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to chunk "
                      "events file into. Only applies when running on cluster.")
    parser.add_option("--no-filter-events", dest="no_filter_events",
                      action="store_true", default=False,
                      help="Do not filter events for computing Psi. "
                      "By default, MISO computes Psi only for events that "
                      "have a sufficient number of junction reads. "
                      "The default filter varies by event type.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),                    
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", default=None, type="int",
                      help="Length of sequenced reads.")
    parser.add_option("--paired-end", dest="paired_end", nargs=2, default=None,
                      help="Run in paired-end mode. Takes mean and "
                      "standard deviation of insert length distribution.")
    parser.add_option("--overhang-len", dest="overhang_len",
                      default=None, type="int",
                      help="Length of overhang constraints "
                      "imposed on junctions.")
    parser.add_option("--output-dir", dest="output_dir", default=None,
                      help="Directory for MISO output.")
    parser.add_option("--job-name", dest="job_name", nargs=1,
                      help="Name for jobs submitted to queue for SGE jobs. " \
                      "Default is misojob", default="misojob")
    parser.add_option("--SGEarray", dest="SGEarray",
                      action="store_true", default=False,
                      help="Use MISO on cluster with Sun Grid Engine. "
                      "To be used in conjunction with --use-cluster option.")
    parser.add_option("--prefilter", dest="prefilter", default=False,
                      action="store_true",
                      help="Prefilter events based on coverage. If given as " 
                      "argument, run will begin by mapping BAM reads to event "
                      "regions (using bedtools), and omit events that do not "
                      "meet coverage criteria from the run. By default, turned "
                      "off. Note that events that do not meet the coverage criteria "
                      "will not be processed regardless, but --prefilter simply "
                      "does this filtering step at the start of the run, potentially "
                      "saving computation time so that low coverage events will not "
                      "be processed or distributed to jobs if MISO is run on a "
                      "cluster. This options requires bedtools to be installed and "
                      "available on path.")
    parser.add_option("-p", dest="num_proc", default=None, nargs=1,
                      help="Number of processors to use. Only applies when running " \
                      "MISO on a single machine with multiple cores; does not apply " \
                      "to runs submitted to cluster with --use-cluster.")
    parser.add_option("--version", dest="version", default=False,
                      action="store_true",
                      help="Print MISO version.")
    parser.add_option("--no-wait", dest="no_wait", default=False,
                      action="store_true",
                      help="If passed in, do not wait on cluster jobs after " \
                      "they are submitted. By default, wait.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    greeting()

    if options.version:
        print "MISO version %s\n" %(misopy.__version__)

    ##
    ## Load the settings file 
    ##
    if not os.path.isdir(miso_settings_path):
        print "Error: %s is not a directory containing a default MISO " \
              "settings filename. Please specify a settings filename " \
              "using --settings-filename."
        return
    
    settings_filename = \
        os.path.abspath(os.path.expanduser(options.settings_filename))
    Settings.load(settings_filename)
    
    if (not options.use_cluster) and options.chunk_jobs:
        print "Error: Chunking jobs only applies when using " \
              "the --use-cluster option to run MISO on cluster."
        sys.exit(1)
    if (not options.use_cluster) and options.SGEarray:
        print "Error: SGEarray implies that you are using an SGE cluster," \
              "please run again with --use-cluster option enabled."
        sys.exit(1)

    ##
    ## Quantitation using BAM for all genes
    ##
    if options.compute_genes_psi != None:
        # GFF filename with genes to process
        gff_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[0]))

        # BAM filename with reads
        bam_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[1]))

        if options.output_dir == None:
            print "Error: need --output-dir to compute Psi values."
            sys.exit(1)

        # Output directory to use
        output_dir = os.path.abspath(os.path.expanduser(options.output_dir))

        ##
        ## Load the main logging object
        ##
        logs_output_dir = os.path.join(output_dir, "logs")
        main_logger = get_main_logger(logs_output_dir)

        if options.read_len == None:
            main_logger.error("need --read-len to compute Psi values.")
            sys.exit(1)

        overhang_len = 1

        if options.paired_end != None and options.overhang_len != None:
            main_logger.warning("cannot use --overhang-len in paired-end mode.\n" \
                                "Using overhang = 1")
        if options.overhang_len != None:
            overhang_len = options.overhang_len

        # Whether to wait on cluster jobs or not
        wait_on_jobs = not options.no_wait
        compute_all_genes_psi(gff_filename, bam_filename,
                              options.read_len, output_dir,
                              main_logger,
                              overhang_len=overhang_len,
                              use_cluster=options.use_cluster,
                              SGEarray=options.SGEarray,
                              job_name=options.job_name,
                              chunk_jobs=options.chunk_jobs,
                              paired_end=options.paired_end,
                              settings_fname=settings_filename,
                              prefilter=options.prefilter,
                              num_proc=options.num_proc,
                              wait_on_jobs=wait_on_jobs)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="    
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Exemple #6
0
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--run", dest="compute_genes_psi",
                      nargs=2, default=None,
                      help="Compute Psi values for a given GFF annotation "
                      "of either whole mRNA isoforms or isoforms produced by "
                      "single alternative splicing events. Expects two "
                      "arguments: an indexed GFF directory with genes to "
                      "process, and a sorted, indexed BAM file (with "
                      "headers) to run on.")
    parser.add_option("--event-type", dest="event_type", nargs=1,
                      help="[OPTIONAL] Type of event (e.g. SE, RI, A3SS, ...)",
                      default=None)
    parser.add_option("--use-cluster", dest="use_cluster",
                      action="store_true", default=False,
                      help="Run events on cluster.")
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to chunk "
                      "events file into. Only applies when running on cluster.")
    parser.add_option("--no-filter-events", dest="no_filter_events",
                      action="store_true", default=False,
                      help="Do not filter events for computing Psi. "
                      "By default, MISO computes Psi only for events that "
                      "have a sufficient number of junction reads. "
                      "The default filter varies by event type.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),                    
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", default=None, type="int",
                      help="Length of sequenced reads.")
    parser.add_option("--paired-end", dest="paired_end", nargs=2, default=None,
                      help="Run in paired-end mode. Takes mean and "
                      "standard deviation of insert length distribution.")
    parser.add_option("--overhang-len", dest="overhang_len",
                      default=None, type="int",
                      help="Length of overhang constraints "
                      "imposed on junctions.")
    parser.add_option("--output-dir", dest="output_dir", default=None,
                      help="Directory for MISO output.")
    parser.add_option("--job-name", dest="job_name", nargs=1,
                      help="Name for jobs submitted to queue for SGE jobs. " \
                      "Default is misojob", default="misojob")
    parser.add_option("--SGEarray", dest="SGEarray",
                      action="store_true", default=False,
                      help="Use MISO on cluster with Sun Grid Engine. "
                      "To be used in conjunction with --use-cluster option.")
    parser.add_option("--prefilter", dest="prefilter", default=False,
                      action="store_true",
                      help="Prefilter events based on coverage. If given as " 
                      "argument, run will begin by mapping BAM reads to event "
                      "regions (using bedtools), and omit events that do not "
                      "meet coverage criteria from the run. By default, turned "
                      "off. Note that events that do not meet the coverage criteria "
                      "will not be processed regardless, but --prefilter simply "
                      "does this filtering step at the start of the run, potentially "
                      "saving computation time so that low coverage events will not "
                      "be processed or distributed to jobs if MISO is run on a "
                      "cluster. This options requires bedtools to be installed and "
                      "available on path.")
    parser.add_option("-p", dest="num_proc", default=None, nargs=1,
                      help="Number of processors to use. Only applies when running " \
                      "MISO on a single machine with multiple cores; does not apply " \
                      "to runs submitted to cluster with --use-cluster.")
    parser.add_option("--version", dest="version", default=False,
                      action="store_true",
                      help="Print MISO version.")
    parser.add_option("--no-wait", dest="no_wait", default=False,
                      action="store_true",
                      help="If passed in, do not wait on cluster jobs after " \
                      "they are submitted. By default, wait.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    greeting()

    if options.version:
        print "MISO version %s\n" %(misopy.__version__)

    ##
    ## Load the settings file 
    ##
    if not os.path.isdir(miso_settings_path):
        print "Error: %s is not a directory containing a default MISO " \
              "settings filename. Please specify a settings filename " \
              "using --settings-filename."
        return
    
    settings_filename = \
        os.path.abspath(os.path.expanduser(options.settings_filename))
    Settings.load(settings_filename)
    
    if (not options.use_cluster) and options.chunk_jobs:
        print "Error: Chunking jobs only applies when using " \
              "the --use-cluster option to run MISO on cluster."
        sys.exit(1)
    if (not options.use_cluster) and options.SGEarray:
        print "Error: SGEarray implies that you are using an SGE cluster," \
              "please run again with --use-cluster option enabled."
        sys.exit(1)

    ##
    ## Quantitation using BAM for all genes
    ##
    if options.compute_genes_psi != None:
        # GFF filename with genes to process
        gff_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[0]))

        # BAM filename with reads
        bam_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[1]))

        if options.output_dir == None:
            print "Error: need --output-dir to compute Psi values."
            sys.exit(1)

        # Output directory to use
        output_dir = os.path.abspath(os.path.expanduser(options.output_dir))

        if options.read_len == None:
            print "Error: need --read-len to compute Psi values."
            sys.exit(1)

        overhang_len = 1

        if options.paired_end != None and options.overhang_len != None:
            print "WARNING: cannot use --overhang-len in paired-end mode."
            print "Using overhang = 1"

        if options.overhang_len != None:
            overhang_len = options.overhang_len

        # Whether to wait on cluster jobs or not
        wait_on_jobs = not options.no_wait
        compute_all_genes_psi(gff_filename, bam_filename,
                              options.read_len, output_dir,
                              overhang_len=overhang_len,
                              use_cluster=options.use_cluster,
                              SGEarray=options.SGEarray,
                              job_name=options.job_name,
                              chunk_jobs=options.chunk_jobs,
                              paired_end=options.paired_end,
                              settings_fname=settings_filename,
                              prefilter=options.prefilter,
                              num_proc=options.num_proc,
                              wait_on_jobs=wait_on_jobs)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="    
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Exemple #7
0
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: collection of gene IDs (matching the IDs in the indexed
      GFF) to run on; genes of the index not listed are skipped
    - gff_index_filename: indexed GFF (.pickle) filename describing
      the genes
    - bam_filename: BAM filename with the reads (must be sorted and
      indexed)
    - output_dir: output directory (created if needed)
    - read_len: read length; genes whose isoforms are all shorter than
      this are skipped
    - overhang_len: overhang length passed to the sampler parameters
    - paired_end: optional pair (mean, standard deviation) of the
      fragment length distribution; enables paired-end mode
    - event_type: optional event type; if given, results are written
      under an <output_dir>/<event_type>/<chrom> directory instead of
      <output_dir>/<chrom>
    - verbose: currently unused in this function

    Returns None. Prints an error and returns if the GFF index does not
    exist; exits with status 1 if the index filename does not end with
    ".pickle" once a gene is about to be written.
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        # paired_end holds (mean, standard deviation); the sampler
        # parameters take the variance
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # Optional settings: a template for the SAM file and whether to
    # filter genes by read count (on by default). Both lookups are
    # guarded so that missing/empty settings fall back to the defaults
    # instead of raising.
    template = None
    filter_reads = True

    if settings:
        if "sam_template" in settings:
            template = settings["sam_template"]
        if "filter_reads" in settings:
            filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode (the result is not used below)
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)

    pickle_ext = ".pickle"

    for gene_id, gene_info in gff_genes.iteritems():
        # Skip genes that we were not asked to run on
        if gene_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(iso_len < read_len for iso_len in gene_obj.iso_lens):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand)
        # Skip gene if too few reads align to the gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads,
                        min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            # An already-existing directory is expected and fine; any
            # other failure (e.g. permissions) must not be hidden.
            if not os.path.isdir(chrom_dir):
                raise

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(pickle_ext):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        # Strip only the trailing extension, not every ".pickle"
        # occurrence inside the name
        miso_basename = miso_basename[:-len(pickle_ext)]
        output_filename = os.path.join(chrom_dir, miso_basename)
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
Exemple #8
0
def main():
    """
    Command-line entry point for single-gene Psi computation and
    Psi utilities.

    Parses the command-line options, loads the MISO settings file and
    then performs each requested action in turn:

    - --compare-samples: output comparison statistics for two samples
    - --compute-genes-from-file / --compute-gene-psi: run Psi
      computation (delegated to run_compute_genes_from_file and
      run_compute_gene_psi)
    - --summarize-samples: write a <label>.miso_summary file into a
      'summary' subdirectory of the given output directory
    - --view-gene: print the contents of an indexed (.pickle) gene file

    Exits with status 1 if the --use-compressed mapping file is
    missing, if the summary label is empty, or if the gene index given
    to --view-gene cannot be loaded.
    """
    from optparse import OptionParser
    parser = OptionParser()

    ##
    ## Main options
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi",
                      nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene. "
                      "Expects four arguments: the first is a gene ID or set "
                      "of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene "
                      "information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, "
                      "and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode.  Takes a mean and standard "
                      "deviation for the fragment length distribution (assumed "
                      "to have discretized normal form.)")
    parser.add_option("--compute-genes-from-file", dest="compute_genes_from_file",
                      nargs=3, default=None,
                      help="Runs on a set of genes from a file. Takes as input: "
                      "(1) a two-column tab-delimited file, where column 1 is the "
                      "event ID (ID field from GFF) and the second column is "
                      "the path to the indexed GFF file for that event. "
                      "MISO will run on all the events described in the file, "
                      "(2) a sorted, indexed BAM file to run on, and (3) a "
                      "directory to output results to.")

    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
                      help="Compute comparison statistics between the two "
                      "given samples. Expects three directories: the first is "
                      "sample1's MISO output, the second is sample2's MISO "
                      "output, and the third is the directory where "
                      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
                      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-cluster", action="store_true",
                      dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to "
                      "chunk events file into. Only applies when "
                      "running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int",
                      default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int",
                      default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
                      help="Event type of two-isoform "
                      "events (e.g. 'SE', 'RI', 'A3SS', ...)")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    # The greeting is suppressed for --compute-gene-psi runs
    if options.compute_gene_psi is None:
        greeting()

    ##
    ## Load the settings file
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."

    ##
    ## Comparing samples
    ##
    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
    ##
    ## Main interface based on SAM files
    ##
    if options.compute_genes_from_file is not None:
        # Run on events given by file
        run_compute_genes_from_file(options)
    if options.compute_gene_psi is not None:
        run_compute_gene_psi(options)

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples is not None:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label is not None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        # Explicit validation rather than an assert, which would be
        # stripped when running with -O
        if len(samples_label) < 1:
            print "Error: summary label is empty."
            sys.exit(1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        summarize_sampler_results(samples_dir, summary_filename,
                                  use_compressed=use_compressed)

    if options.view_gene is not None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes is None:
            print "No genes."
            sys.exit(1)

        # Dump every gene stored in the index: its object, isoforms,
        # mRNA IDs and exons
        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon