Example #1
0
def runMISOlocal(pickledDir, bamFile, readlen, overhanglen, outdir,
                 paired_end, settings_f):
    """Run MISO on a single BAM file in place (inputs are not copied anywhere).

    Args:
        pickledDir (str/path): Indexed (pickled) MISO annotation, forwarded
            as the first argument of compute_all_genes_psi.
        bamFile (str/path): BAM file to quantify, forwarded unchanged.
        readlen (int or str): Read length; converted with int().
        overhanglen (int or str): Overhang length; converted with int().
        outdir (str/path): Output location handed to MISO.
        paired_end: Paired-end setting. The values False and 'False' are
            replaced by None; any other value is forwarded unchanged.
        settings_f (str/path): MISO settings file. It is loaded through
            Settings.load and also forwarded as ``settings_fname``.

    Returns:
        None. All results are produced by compute_all_genes_psi.
    """
    # Both the boolean and its string spelling mean "not paired-end".
    if paired_end in (False, 'False'):
        paired_end = None

    # Settings are loaded before the numeric arguments are converted.
    Settings.load(settings_f)

    read_length = int(readlen)
    overhang = int(overhanglen)
    run_events_analysis.compute_all_genes_psi(
        pickledDir, bamFile, read_length, outdir,
        overhang_len=overhang,
        paired_end=paired_end,
        settings_fname=settings_f)
Example #2
0
def runMISOlocal(pickledDir, bamFile, readlen, overhanglen, outdir,
                 paired_end, settings_f):
    """Load the MISO settings and quantify one BAM file without staging it.

    ``readlen`` and ``overhanglen`` are converted with int() before being
    forwarded. A ``paired_end`` of False or 'False' is forwarded as None;
    every other value is passed through untouched. ``settings_f`` is both
    loaded via Settings.load and forwarded as ``settings_fname``.
    """
    single_end_markers = (False, 'False')
    paired_end = None if paired_end in single_end_markers else paired_end

    Settings.load(settings_f)

    positional = (pickledDir, bamFile, int(readlen), outdir)
    keywords = dict(overhang_len=int(overhanglen),
                    paired_end=paired_end,
                    settings_fname=settings_f)
    run_events_analysis.compute_all_genes_psi(*positional, **keywords)
Example #3
0
def main():
    from optparse import OptionParser
    parser = OptionParser()

    ##
    ## Main options
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi",
                      nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene. "
                      "Expects four arguments: the first is a gene ID or set "
                      "of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene "
                      "information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, "
                      "and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode.  Takes a mean and standard "
                      "deviation for the fragment length distribution (assumed "
                      "to have discretized normal form.)")
    parser.add_option("--compute-genes-from-file", dest="compute_genes_from_file",
                      nargs=3, default=None,
                      help="Runs on a set of genes from a file. Takes as input: "
                      "(1) a two-column tab-delimited file, where column 1 is the "
                      "event ID (ID field from GFF) and the second column is "
                      "the path to the indexed GFF file for that event. "
                      "MISO will run on all the events described in the file, "
                      "(2) a sorted, indexed BAM file to run on, and (3) a "
                      "directory to output results to.")

    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
                      help="Compute comparison statistics between the two "
                      "given samples. Expects three directories: the first is "
                      "sample1's MISO output, the second is sample2's MISO "
                      "output, and the third is the directory where "
                      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
                      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-cluster", action="store_true",
                      dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to "
                      "chunk events file into. Only applies when "
                      "running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int",
                      default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int",
                      default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
                      help="Event type of two-isoform "
                      "events (e.g. 'SE', 'RI', 'A3SS', ...)")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    if options.compute_gene_psi is None:
        greeting()

    ##
    ## Load the settings file
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."

    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
    ##
    ## Main interface based on SAM files
    ##
    if options.compute_genes_from_file != None:
        # Run on events given by file
        run_compute_genes_from_file(options)
    if options.compute_gene_psi != None:
        run_compute_gene_psi(options)

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label != None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        assert(len(samples_label) >= 1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        summarize_sampler_results(samples_dir, summary_filename,
                                  use_compressed=use_compressed)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Example #4
0
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--run", dest="compute_genes_psi",
                      nargs=2, default=None,
                      help="Compute Psi values for a given GFF annotation "
                      "of either whole mRNA isoforms or isoforms produced by "
                      "single alternative splicing events. Expects two "
                      "arguments: an indexed GFF directory with genes to "
                      "process, and a sorted, indexed BAM file (with "
                      "headers) to run on.")
    parser.add_option("--event-type", dest="event_type", nargs=1,
                      help="[OPTIONAL] Type of event (e.g. SE, RI, A3SS, ...)",
                      default=None)
    parser.add_option("--use-cluster", dest="use_cluster",
                      action="store_true", default=False,
                      help="Run events on cluster.")
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to chunk "
                      "events file into. Only applies when running on cluster.")
    parser.add_option("--no-filter-events", dest="no_filter_events",
                      action="store_true", default=False,
                      help="Do not filter events for computing Psi. "
                      "By default, MISO computes Psi only for events that "
                      "have a sufficient number of junction reads. "
                      "The default filter varies by event type.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),                    
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", default=None, type="int",
                      help="Length of sequenced reads.")
    parser.add_option("--paired-end", dest="paired_end", nargs=2, default=None,
                      help="Run in paired-end mode. Takes mean and "
                      "standard deviation of insert length distribution.")
    parser.add_option("--overhang-len", dest="overhang_len",
                      default=None, type="int",
                      help="Length of overhang constraints "
                      "imposed on junctions.")
    parser.add_option("--output-dir", dest="output_dir", default=None,
                      help="Directory for MISO output.")
    parser.add_option("--job-name", dest="job_name", nargs=1,
                      help="Name for jobs submitted to queue for SGE jobs. " \
                      "Default is misojob", default="misojob")
    parser.add_option("--SGEarray", dest="SGEarray",
                      action="store_true", default=False,
                      help="Use MISO on cluster with Sun Grid Engine. "
                      "To be used in conjunction with --use-cluster option.")
    parser.add_option("--prefilter", dest="prefilter", default=False,
                      action="store_true",
                      help="Prefilter events based on coverage. If given as " 
                      "argument, run will begin by mapping BAM reads to event "
                      "regions (using bedtools), and omit events that do not "
                      "meet coverage criteria from the run. By default, turned "
                      "off. Note that events that do not meet the coverage criteria "
                      "will not be processed regardless, but --prefilter simply "
                      "does this filtering step at the start of the run, potentially "
                      "saving computation time so that low coverage events will not "
                      "be processed or distributed to jobs if MISO is run on a "
                      "cluster. This options requires bedtools to be installed and "
                      "available on path.")
    parser.add_option("-p", dest="num_proc", default=None, nargs=1,
                      help="Number of processors to use. Only applies when running " \
                      "MISO on a single machine with multiple cores; does not apply " \
                      "to runs submitted to cluster with --use-cluster.")
    parser.add_option("--version", dest="version", default=False,
                      action="store_true",
                      help="Print MISO version.")
    parser.add_option("--no-wait", dest="no_wait", default=False,
                      action="store_true",
                      help="If passed in, do not wait on cluster jobs after " \
                      "they are submitted. By default, wait.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    greeting()

    if options.version:
        print "MISO version %s\n" %(misopy.__version__)

    ##
    ## Load the settings file 
    ##
    if not os.path.isdir(miso_settings_path):
        print "Error: %s is not a directory containing a default MISO " \
              "settings filename. Please specify a settings filename " \
              "using --settings-filename."
        return
    
    settings_filename = \
        os.path.abspath(os.path.expanduser(options.settings_filename))
    Settings.load(settings_filename)
    
    if (not options.use_cluster) and options.chunk_jobs:
        print "Error: Chunking jobs only applies when using " \
              "the --use-cluster option to run MISO on cluster."
        sys.exit(1)
    if (not options.use_cluster) and options.SGEarray:
        print "Error: SGEarray implies that you are using an SGE cluster," \
              "please run again with --use-cluster option enabled."
        sys.exit(1)

    ##
    ## Quantitation using BAM for all genes
    ##
    if options.compute_genes_psi != None:
        # GFF filename with genes to process
        gff_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[0]))

        # BAM filename with reads
        bam_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[1]))

        if options.output_dir == None:
            print "Error: need --output-dir to compute Psi values."
            sys.exit(1)

        # Output directory to use
        output_dir = os.path.abspath(os.path.expanduser(options.output_dir))

        ##
        ## Load the main logging object
        ##
        logs_output_dir = os.path.join(output_dir, "logs")
        main_logger = get_main_logger(logs_output_dir)

        if options.read_len == None:
            main_logger.error("need --read-len to compute Psi values.")
            sys.exit(1)

        overhang_len = 1

        if options.paired_end != None and options.overhang_len != None:
            main_logger.warning("cannot use --overhang-len in paired-end mode.\n" \
                                "Using overhang = 1")
        if options.overhang_len != None:
            overhang_len = options.overhang_len

        # Whether to wait on cluster jobs or not
        wait_on_jobs = not options.no_wait
        compute_all_genes_psi(gff_filename, bam_filename,
                              options.read_len, output_dir,
                              main_logger,
                              overhang_len=overhang_len,
                              use_cluster=options.use_cluster,
                              SGEarray=options.SGEarray,
                              job_name=options.job_name,
                              chunk_jobs=options.chunk_jobs,
                              paired_end=options.paired_end,
                              settings_fname=settings_filename,
                              prefilter=options.prefilter,
                              num_proc=options.num_proc,
                              wait_on_jobs=wait_on_jobs)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="    
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Example #5
0
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--run", dest="compute_genes_psi",
                      nargs=2, default=None,
                      help="Compute Psi values for a given GFF annotation "
                      "of either whole mRNA isoforms or isoforms produced by "
                      "single alternative splicing events. Expects two "
                      "arguments: an indexed GFF directory with genes to "
                      "process, and a sorted, indexed BAM file (with "
                      "headers) to run on.")
    parser.add_option("--event-type", dest="event_type", nargs=1,
                      help="[OPTIONAL] Type of event (e.g. SE, RI, A3SS, ...)",
                      default=None)
    parser.add_option("--use-cluster", dest="use_cluster",
                      action="store_true", default=False,
                      help="Run events on cluster.")
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to chunk "
                      "events file into. Only applies when running on cluster.")
    parser.add_option("--no-filter-events", dest="no_filter_events",
                      action="store_true", default=False,
                      help="Do not filter events for computing Psi. "
                      "By default, MISO computes Psi only for events that "
                      "have a sufficient number of junction reads. "
                      "The default filter varies by event type.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),                    
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", default=None, type="int",
                      help="Length of sequenced reads.")
    parser.add_option("--paired-end", dest="paired_end", nargs=2, default=None,
                      help="Run in paired-end mode. Takes mean and "
                      "standard deviation of insert length distribution.")
    parser.add_option("--overhang-len", dest="overhang_len",
                      default=None, type="int",
                      help="Length of overhang constraints "
                      "imposed on junctions.")
    parser.add_option("--output-dir", dest="output_dir", default=None,
                      help="Directory for MISO output.")
    parser.add_option("--job-name", dest="job_name", nargs=1,
                      help="Name for jobs submitted to queue for SGE jobs. " \
                      "Default is misojob", default="misojob")
    parser.add_option("--SGEarray", dest="SGEarray",
                      action="store_true", default=False,
                      help="Use MISO on cluster with Sun Grid Engine. "
                      "To be used in conjunction with --use-cluster option.")
    parser.add_option("--prefilter", dest="prefilter", default=False,
                      action="store_true",
                      help="Prefilter events based on coverage. If given as " 
                      "argument, run will begin by mapping BAM reads to event "
                      "regions (using bedtools), and omit events that do not "
                      "meet coverage criteria from the run. By default, turned "
                      "off. Note that events that do not meet the coverage criteria "
                      "will not be processed regardless, but --prefilter simply "
                      "does this filtering step at the start of the run, potentially "
                      "saving computation time so that low coverage events will not "
                      "be processed or distributed to jobs if MISO is run on a "
                      "cluster. This options requires bedtools to be installed and "
                      "available on path.")
    parser.add_option("-p", dest="num_proc", default=None, nargs=1,
                      help="Number of processors to use. Only applies when running " \
                      "MISO on a single machine with multiple cores; does not apply " \
                      "to runs submitted to cluster with --use-cluster.")
    parser.add_option("--version", dest="version", default=False,
                      action="store_true",
                      help="Print MISO version.")
    parser.add_option("--no-wait", dest="no_wait", default=False,
                      action="store_true",
                      help="If passed in, do not wait on cluster jobs after " \
                      "they are submitted. By default, wait.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    greeting()

    if options.version:
        print "MISO version %s\n" %(misopy.__version__)

    ##
    ## Load the settings file 
    ##
    if not os.path.isdir(miso_settings_path):
        print "Error: %s is not a directory containing a default MISO " \
              "settings filename. Please specify a settings filename " \
              "using --settings-filename."
        return
    
    settings_filename = \
        os.path.abspath(os.path.expanduser(options.settings_filename))
    Settings.load(settings_filename)
    
    if (not options.use_cluster) and options.chunk_jobs:
        print "Error: Chunking jobs only applies when using " \
              "the --use-cluster option to run MISO on cluster."
        sys.exit(1)
    if (not options.use_cluster) and options.SGEarray:
        print "Error: SGEarray implies that you are using an SGE cluster," \
              "please run again with --use-cluster option enabled."
        sys.exit(1)

    ##
    ## Quantitation using BAM for all genes
    ##
    if options.compute_genes_psi != None:
        # GFF filename with genes to process
        gff_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[0]))

        # BAM filename with reads
        bam_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[1]))

        if options.output_dir == None:
            print "Error: need --output-dir to compute Psi values."
            sys.exit(1)

        # Output directory to use
        output_dir = os.path.abspath(os.path.expanduser(options.output_dir))

        if options.read_len == None:
            print "Error: need --read-len to compute Psi values."
            sys.exit(1)

        overhang_len = 1

        if options.paired_end != None and options.overhang_len != None:
            print "WARNING: cannot use --overhang-len in paired-end mode."
            print "Using overhang = 1"

        if options.overhang_len != None:
            overhang_len = options.overhang_len

        # Whether to wait on cluster jobs or not
        wait_on_jobs = not options.no_wait
        compute_all_genes_psi(gff_filename, bam_filename,
                              options.read_len, output_dir,
                              overhang_len=overhang_len,
                              use_cluster=options.use_cluster,
                              SGEarray=options.SGEarray,
                              job_name=options.job_name,
                              chunk_jobs=options.chunk_jobs,
                              paired_end=options.paired_end,
                              settings_fname=settings_filename,
                              prefilter=options.prefilter,
                              num_proc=options.num_proc,
                              wait_on_jobs=wait_on_jobs)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes == None:
            print "No genes."
            sys.exit(1)

        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="    
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
Example #6
0
def runMISOsingle(pickledDir, bamFile, readlen, overhanglen, outdir,
                  paired_end, settings_f, scratchDir):
    """Run MISO on one BAM file using copies staged in a scratch directory.

    The indexed annotation and the BAM file (plus its ``.bai`` index) are
    copied into ``scratchDir``, MISO is run on the copies, the results are
    copied into ``outdir`` and the scratch copies are removed again.  The
    scratch copies are removed even if MISO raises.

    External commands are started with argument lists instead of shell
    strings, so paths containing spaces or shell metacharacters are passed
    through literally.  As before, their exit codes are not checked.

    Args:
        pickledDir (str/path): Indexed (pickled) MISO annotation directory.
            Its non-hidden entries are copied to a uniquely named directory
            inside ``scratchDir``.
        bamFile (str/path): BAM file to quantify; ``bamFile + '.bai'`` is
            copied alongside it.
        readlen (int or str): Read length; converted with int().
        overhanglen (int or str): Overhang length; converted with int().
        outdir (str/path): Final destination of the MISO results; created
            with ``mkdir -p`` if missing.
        paired_end: Paired-end setting forwarded to MISO.  False and
            'False' are replaced by None, matching runMISOlocal.
        settings_f (str/path): MISO settings file; loaded via Settings.load
            and forwarded as ``settings_fname``.
        scratchDir (str/path): Working directory for the temporary copies;
            created with ``mkdir`` if missing.

    Returns:
        None.

    NOTE(review): the scratch copy of the BAM file is not given a unique
    name, so concurrent runs on the same BAM that share ``scratchDir``
    would overwrite and remove each other's copy.
    """
    def run(args):
        # No shell is involved: every element is one literal argument.
        return subprocess.Popen(args).wait()

    def visible_entries(directory):
        # Same selection as the shell pattern "directory/*": every entry
        # whose name does not start with a dot.  A missing directory
        # yields nothing instead of raising.
        if not os.path.isdir(directory):
            return []
        return [os.path.join(directory, name)
                for name in sorted(os.listdir(directory))
                if not name.startswith('.')]

    if paired_end == False or paired_end == 'False':
        paired_end = None

    # Unique suffix so parallel runs get separate annotation/output dirs.
    stamp = str(time.time()) + str(random.random())

    print(os.path.basename(pickledDir))
    if not os.path.exists(scratchDir):
        run(['mkdir', scratchDir])

    # Scratch locations of the staged inputs and of the MISO output.
    pickled = os.path.join(scratchDir,
                           os.path.basename(pickledDir) + "." + stamp)
    bam = os.path.join(scratchDir, os.path.basename(bamFile))
    out = os.path.join(scratchDir, os.path.basename(outdir + "." + stamp))

    try:
        # Copy pickled dir.
        run(['mkdir', pickled])
        annotation_entries = visible_entries(pickledDir)
        if annotation_entries:
            run(['cp', '-r'] + annotation_entries + [pickled])

        # Copy bam file and its index, dereferencing symlinks.
        run(['cp', '-fL', bamFile, scratchDir])
        run(['cp', '-fL', bamFile + '.bai', scratchDir])

        # LOAD SETTINGS FOR MISO
        Settings.load(settings_f)

        run_events_analysis.compute_all_genes_psi(
            pickled, bam, int(readlen), out,
            overhang_len=int(overhanglen),
            paired_end=paired_end,
            settings_fname=settings_f,
            prefilter=False)

        if not os.path.exists(outdir):
            run(['mkdir', '-p', outdir])

        # Copy output back.
        result_entries = visible_entries(out)
        if result_entries:
            run(['cp', '-r'] + result_entries + [outdir])
    finally:
        # Remove bam, output, and pickled dir, also after a failure.
        # -f keeps rm quiet when a copy was never created.
        run(['rm', '-f', bam])
        run(['rm', '-f', bam + '.bai'])
        run(['rm', '-fr', out])
        run(['rm', '-fr', pickled])
Example #7
0
def main():
    """Command-line entry point: parse the options and run each requested action.

    After parsing, the settings file named by --settings-filename is loaded,
    and then every action whose option was supplied runs, in this order:

    1. --compare-samples: compare two MISO output directories via
       ht.output_samples_comparison.
    2. --compute-genes-from-file: delegate to run_compute_genes_from_file.
    3. --compute-gene-psi: delegate to run_compute_gene_psi.
    4. --summarize-samples: write '<label>.miso_summary' into a 'summary'
       sub-directory of the given output directory.
    5. --view-gene: print the contents of an indexed (.pickle) gene file.

    The actions are independent 'if' blocks, not alternatives, so several
    can run in one invocation.  The process exits with status 1 when the
    --use-compressed mapping file does not exist or when --view-gene loads
    no genes.
    """
    from optparse import OptionParser
    parser = OptionParser()

    ##
    ## Main options
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi",
                      nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene. "
                      "Expects four arguments: the first is a gene ID or set "
                      "of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene "
                      "information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, "
                      "and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode.  Takes a mean and standard "
                      "deviation for the fragment length distribution (assumed "
                      "to have discretized normal form.)")
    parser.add_option("--compute-genes-from-file", dest="compute_genes_from_file",
                      nargs=3, default=None,
                      help="Runs on a set of genes from a file. Takes as input: "
                      "(1) a two-column tab-delimited file, where column 1 is the "
                      "event ID (ID field from GFF) and the second column is "
                      "the path to the indexed GFF file for that event. "
                      "MISO will run on all the events described in the file, "
                      "(2) a sorted, indexed BAM file to run on, and (3) a "
                      "directory to output results to.")

    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
                      help="Compute comparison statistics between the two "
                      "given samples. Expects three directories: the first is "
                      "sample1's MISO output, the second is sample2's MISO "
                      "output, and the third is the directory where "
                      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
                      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-cluster", action="store_true",
                      dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to "
                      "chunk events file into. Only applies when "
                      "running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int",
                      default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int",
                      default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
                      help="Event type of two-isoform "
                      "events (e.g. 'SE', 'RI', 'A3SS', ...)")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    # The greeting is skipped only for --compute-gene-psi runs.
    if options.compute_gene_psi is None:
        greeting()

    ##
    ## Load the settings file
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    # Absolute path of the event-ID -> compressed-ID mapping, or None when
    # compression is not requested.
    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print("Error: mapping filename from event IDs to compressed IDs %s "
                  "is not found." % (use_compressed))
            sys.exit(1)
        else:
            print("Compression being used.")

    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print("Making comparisons directory: %s" % (output_dirname))
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
    ##
    ## Main interface based on SAM files
    ##
    if options.compute_genes_from_file is not None:
        # Run on events given by file
        run_compute_genes_from_file(options)
    if options.compute_gene_psi is not None:
        run_compute_gene_psi(options)

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label is not None:
            samples_label = options.summary_label
            print("Using summary label: %s" % (samples_label))
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        assert len(samples_label) >= 1
        # The summary always goes into a 'summary' sub-directory of the
        # output directory given on the command line.
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' % (samples_label))
        summarize_sampler_results(samples_dir, summary_filename,
                                  use_compressed=use_compressed)

    if options.view_gene is not None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print("Viewing genes in %s" % (indexed_gene_filename))
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes is None:
            print("No genes.")
            sys.exit(1)

        # Two-item prints are written as "%s %s" so the output keeps the
        # single separating space that 'print a, b' used to insert.
        for gene_id, gene_info in gff_genes.items():
            print("Gene %s" % (gene_id))
            gene_obj = gene_info['gene_object']
            print("%s %s" % (" - Gene object: ", gene_obj))
            print("==")
            print("Isoforms: ")
            for isoform in gene_obj.isoforms:
                print("%s %s" % (" - ", isoform))
            print("==")
            print("mRNA IDs: ")
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print("%s" % (mRNA_id))
            print("==")
            print("Exons: ")
            for exon in gene_obj.parts:
                print("%s %s" % (" - ", exon))
Example #8
0
def runMISOsingle(pickledDir, bamFile, readlen, overhanglen, outdir,
                  paired_end, settings_f, scratchDir):
    """Run MISO on one BAM file using copies staged in a scratch directory.

    The pickled annotation directory, the BAM file and its '.bai' index are
    copied into scratchDir, MISO is run on the copies with prefilter=True,
    the results are copied back to outdir, and the scratch copies are
    removed.  The pickled copy and the scratch output directory get a
    timestamp-plus-random suffix in their names.

    Args:
        pickledDir: Directory whose contents are copied into scratch and
            passed to MISO as the annotation directory.
        bamFile: BAM file to copy; '<bamFile>.bai' is copied alongside it.
        readlen: Read length; converted with int().
        overhanglen: Overhang length; converted with int().
        outdir: Destination for the results; created with 'mkdir -p' when
            it does not exist.
        paired_end: Passed through to MISO, except that False and the
            string 'False' are both mapped to None.
        settings_f: Settings file given to Settings.load and forwarded to
            MISO as settings_fname.
        scratchDir: Staging directory; created when it does not exist.

    Returns:
        None.  An exception raised by MISO propagates to the caller after
        the scratch copies have been removed.
    """
    import glob

    def run(argv):
        # Commands are argv lists executed without a shell, so paths that
        # contain spaces or shell metacharacters are passed through
        # verbatim.  The exit status is ignored: every step is best-effort.
        subprocess.Popen(argv).wait()

    def children(directory):
        # Python-side stand-in for the shell's 'directory/*' expansion
        # (hidden entries are skipped, as they are by the shell).
        return glob.glob(os.path.join(directory, '*'))

    # Accept the boolean as well as its string form.
    if paired_end == False or paired_end == 'False':
        paired_end = None

    t = str(time.time()) + str(random.random())

    print(os.path.basename(pickledDir))
    if not os.path.exists(scratchDir):
        run(['mkdir', scratchDir])

    pickled = os.path.join(scratchDir, os.path.basename(pickledDir) +
        "." + t)
    # NOTE(review): unlike 'pickled' and 'out', the BAM copy has no unique
    # suffix, so two runs sharing scratchDir and BAM basename would share
    # (and delete) the same copy -- confirm callers never do that.
    bam = os.path.join(scratchDir, os.path.basename(bamFile))
    # Give output directory in scratch a timestamp
    out = os.path.join(scratchDir, os.path.basename(outdir + "." + t))

    try:
        # Copy pickled dir.
        run(['mkdir', pickled])
        annotation_entries = children(pickledDir)
        if annotation_entries:
            run(['cp', '-r'] + annotation_entries + [pickled])

        # Copy bam file.
        run(['cp', '-fL', bamFile, scratchDir])
        run(['cp', '-fL', bamFile + '.bai', scratchDir])

        # LOAD SETTINGS FOR MISO
        Settings.load(settings_f)

        run_events_analysis.compute_all_genes_psi(
            pickled, bam, int(readlen), out, overhang_len=int(overhanglen),
            paired_end=paired_end, settings_fname=settings_f, prefilter=True)

        if not os.path.exists(outdir):
            run(['mkdir', '-p', outdir])

        # Copy output back.
        result_entries = children(out)
        if result_entries:
            run(['cp', '-r'] + result_entries + [outdir])
    finally:
        # Remove bam, output, and pickled dir -- also when MISO failed, so
        # aborted runs do not fill up the scratch space.  '-f' keeps the
        # cleanup quiet when a copy was never created.
        run(['rm', '-f', bam])
        run(['rm', '-f', bam + '.bai'])
        run(['rm', '-fr', out])
        run(['rm', '-fr', pickled])
Example #9
0
    def run(self, delay_constant=0.9):
        """
        Run batches either locally on multi-cores
        or using cluster.

        One 'run_miso.py --compute-genes-from-file' command is built per
        batch file returned by self.output_batch_files().  The commands
        are then dispatched in one of three ways:

        - self.use_cluster and self.SGEarray: all commands are handed to
          cluster_utils.run_SGEarray_cluster and the method returns.
        - self.use_cluster only: each command is submitted through the
          cluster engine named by the settings file.
        - otherwise: each command is started as a local shell subprocess
          and recorded in self.threads.

        In the last two cases each command's output is appended to a
        per-batch log file under self.batch_logs_dir.

        Args:
            delay_constant: Seconds to sleep after launching each batch
                (slept twice per batch in cluster mode).

        Returns:
            None.
        """
        batch_filenames = self.output_batch_files()
        # All MISO commands, each correspond to a batch,
        # and the number of jobs in each batch
        all_miso_cmds = []
        num_batches = len(batch_filenames)
        ##
        ## Prepare all the files necessary to run each batch
        ##
        print("Preparing to run %d batches of jobs..." % (num_batches))
        miso_run = os.path.join(miso_path, "run_miso.py")
        for batch_filename, batch_size in batch_filenames:
            miso_cmd = \
              "python %s --compute-genes-from-file \"%s\" %s %s --read-len %d " \
                    %(miso_run,
                      batch_filename,
                      self.bam_filename,
                      self.output_dir,
                      self.read_len)
            # Add paired-end parameters and read len/overhang len
            if self.paired_end is not None:
                # Run in paired-end mode
                frag_mean = float(self.paired_end[0])
                frag_sd = float(self.paired_end[1])
                miso_cmd += " --paired-end %.1f %.1f" % (frag_mean, frag_sd)
            else:
                # Overhang len only used in single-end mode
                miso_cmd += " --overhang-len %d" % (self.overhang_len)
            # Add settings filename if given
            if self.settings_fname is not None:
                miso_cmd += " --settings-filename %s" \
                    %(self.settings_fname)
            all_miso_cmds.append((miso_cmd, batch_size))
        ##
        ## Run all MISO commands for the batches
        ## either locally using multi-cores or on cluster
        ##
        # First handle special case of SGE cluster submission
        if self.use_cluster and self.SGEarray:
            print("Using SGEarray...")
            # Call SGE
            batch_argfile = os.path.join(self.cluster_scripts_dir,
                                         "run_args.txt")
            cluster_utils.run_SGEarray_cluster(all_miso_cmds,
                                               batch_argfile,
                                               self.output_dir,
                                               settings=self.settings_fname,
                                               job_name=self.sge_job_name,
                                               chunk=self.chunk_jobs)
            # End SGE case
            return

        if self.use_cluster:
            # Setup cluster engine.  This does not depend on the batch, so
            # it is done once here instead of once per submitted batch; it
            # also means the engine exists when there are no batches.
            Settings.load(self.settings_fname)
            clustercmd = Settings.get_cluster_command()

            self.cluster_engine = getClusterEngine(clustercmd,
                                                   self.settings_fname)

        # All cluster jobs
        cluster_jobs = []
        for batch_num, cmd_info in enumerate(all_miso_cmds):
            miso_cmd, batch_size = cmd_info
            print("Running batch of %d genes.." % (batch_size))
            print("  - Executing: %s" % (miso_cmd))
            # Make a log file for the batch, where all the output
            # will be redirected
            time_str = time.strftime("%m-%d-%y_%H:%M:%S")
            batch_logfile = os.path.join(
                self.batch_logs_dir, "batch-%d-%s.log" % (batch_num, time_str))
            cmd_to_run = "%s >> \"%s\";" % (miso_cmd, batch_logfile)
            if not self.use_cluster:
                # Run locally
                p = subprocess.Popen(cmd_to_run, shell=True)
                thread_id = "batch-%d" % (batch_num)
                print("  - Submitted thread %s" % (thread_id))
                self.threads[thread_id] = p
            else:
                # Run on cluster; large batches go to the long queue
                if batch_size >= self.long_thresh:
                    queue_type = "long"
                else:
                    queue_type = "short"
                job_name = "gene_psi_batch_%d" % (batch_num)
                print("Submitting to cluster: %s" % (cmd_to_run))
                job_id = \
                    self.cluster_engine.run_on_cluster(cmd_to_run,
                                                 job_name,
                                                 self.output_dir,
                                                 queue_type=queue_type)
                if job_id is not None:
                    cluster_jobs.append(job_id)
                time.sleep(delay_constant)
            # Extra delay constant
            time.sleep(delay_constant)
        # If ran jobs on cluster, wait for them if there are any
        # to wait on.
        if self.wait_on_jobs:
            if self.use_cluster and (len(cluster_jobs) == 0):
                # If we're asked to use the cluster but the list
                # of cluster jobs is empty, it means we could not
                # find the IDs of the job from the submission
                # system. Report this to the user.
                self.main_logger.warning("Asked to wait on cluster jobs but cannot " \
                                         "parse their job IDs from the cluster submission " \
                                         "system.")
            # Try to wait on jobs whenever an engine is available; though
            # if 'cluster_jobs' is empty here, it will not wait.
            # NOTE(review): in local mode this method never creates an
            # engine, so one exists only if it was set elsewhere (e.g. in
            # __init__) -- hence the lookup instead of a bare attribute.
            engine = getattr(self, "cluster_engine", None)
            if engine is not None:
                engine.wait_on_jobs(cluster_jobs, self.cluster_cmd)
        else:
            if self.use_cluster:
                # If we're running in cluster mode and asked not
                # to wait for jobs, let user know
                self.main_logger.info("Not waiting on cluster jobs.")
        # If ran jobs locally, wait on them to finish
        # (this will do nothing if we submitted jobs to
        # cluster)
        self.wait_on_threads()