def check_module_availability(required_modules):
    """
    Report which required Python modules and external programs
    can be found.

    Each name in required_modules is imported in turn and the outcome
    is printed. Afterwards samtools and bedtools are looked up with
    utils.which and their availability is printed as well.

    Returns the number of modules whose import raised ImportError.
    Missing executables are reported but not included in that count.
    """
    num_missing = 0
    print("Checking availability of Python modules for MISO")
    print("Looking for required Python modules..")
    for name in required_modules:
        print("Checking for availability of: %s" % (name))
        is_matplotlib = (name == "matplotlib")
        try:
            __import__(name)
            if is_matplotlib:
                # sashimi_plot relies on pyplot.subplot2grid, so probe
                # for the function itself rather than a version string
                import matplotlib.pyplot as pyplot
                if not hasattr(pyplot, "subplot2grid"):
                    print("WARNING: subplot2grid function is not available in matplotlib. "
                          "to use sashimi_plot, you must upgrade your matplotlib "
                          "to version 1.1.0 or later. This function is *not* required "
                          "for MISO use.")
        except ImportError:
            print("  - Module %s not available!" % (name))
            if is_matplotlib:
                print("matplotlib is required for sashimi_plot")
            num_missing += 1

    if num_missing == 0:
        print("All modules are available!")
    else:
        print("Total of %d modules were not available. "
              "Please install these and try again." % (num_missing))

    print("Looking for required executables..")
    for program in ("samtools", "bedtools"):
        location = utils.which(program)
        print("Checking if %s is available" % (program))
        if location is not None:
            print("  - %s is available" % (program))
            continue
        print(" - Cannot find %s!" % (program))
        if program != "bedtools":
            continue
        print("bedtools is only required for prefiltering "
              "and computation of insert lengths.")
        # tagBam being on the path while bedtools is not hints at an
        # older bedtools installation
        if utils.which("tagBam"):
            print("Your bedtools installation might be available "
                  "but outdated. Please upgrade bedtools and "
                  "ensure that \'bedtools\' is available on path.")
    return num_missing
Example #2
0
def check_module_availability(required_modules):
    """
    Check that the given Python modules can be imported and that the
    external programs samtools and bedtools can be located.

    The result of every check is printed. For matplotlib, a warning is
    additionally printed when pyplot lacks subplot2grid, which
    sashimi_plot needs.

    Returns the number of entries of required_modules that failed to
    import. Missing executables are only reported; they do not affect
    the returned count.
    """
    unavailable_mods = 0
    print("Checking availability of Python modules for MISO")
    print("Looking for required Python modules..")
    for module_name in required_modules:
        print("Checking for availability of: %s" % (module_name))
        try:
            __import__(module_name)
            # Manually check for correct matplotlib version
            # required for sashimi_plot
            if module_name == "matplotlib":
                import matplotlib.pyplot as plt
                if not hasattr(plt, "subplot2grid"):
                    print("WARNING: subplot2grid function is not available in matplotlib. "
                          "to use sashimi_plot, you must upgrade your matplotlib "
                          "to version 1.1.0 or later. This function is *not* required "
                          "for MISO use.")
        except ImportError:
            print("  - Module %s not available!" % (module_name))
            if module_name == "matplotlib":
                print("matplotlib is required for sashimi_plot")
            unavailable_mods += 1
    if unavailable_mods != 0:
        print("Total of %d modules were not available. "
              "Please install these and try again." % (unavailable_mods))
    else:
        print("All modules are available!")
    print("Looking for required executables..")
    required_programs = ["samtools", "bedtools"]
    for prog in required_programs:
        p = utils.which(prog)
        print("Checking if %s is available" % (prog))
        if p is None:
            print(" - Cannot find %s!" % (prog))
            if prog == "bedtools":
                print("bedtools is only required for prefiltering "
                      "and computation of insert lengths.")
                # tagBam present without bedtools suggests an older
                # bedtools installation
                if utils.which("tagBam"):
                    print("Your bedtools installation might be available "
                          "but outdated. Please upgrade bedtools and "
                          "ensure that \'bedtools\' is available on path.")
        else:
            print("  - %s is available" % (prog))
    return unavailable_mods
Example #3
0
def map_bam2gff(bam_filename, gff_filename,
                output_dir,
                interval_label="gff"):
    """
    Map BAM file against intervals in GFF, return results as BAM.

    Only keep hits that are in the interval.

    Uses tagBam utility from bedtools.

    The output is written to
    <output_dir>/bam2gff_<GFF basename>/<BAM basename>; the directory
    is created if needed. If that file already exists the mapping is
    skipped.

    Returns the output BAM filename when the file already exists or
    the command pipeline exits with status 0, and None when the
    pipeline reports a non-zero exit status.

    Exits the process with status 1 if tagBam cannot be found.
    Raises Exception if launching the command fails with ENOENT; any
    other OSError is propagated unchanged.
    """
    gff_basename = os.path.basename(gff_filename)
    bam_basename = os.path.basename(bam_filename)
    output_dir = os.path.join(output_dir, "bam2gff_%s"
                              % (gff_basename))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    output_filename = os.path.join(output_dir, bam_basename)

    print("Mapping BAM to GFF...")
    print("  - BAM: %s" % (bam_filename))
    print("  - GFF: %s" % (gff_filename))
    print("  - Output file: %s" % (output_filename))

    if os.path.isfile(output_filename):
        print("WARNING: %s exists. Skipping.."
              % (output_filename))
        return output_filename

    # "-intervals" option embeds the original GFF coordinates
    # in the output BAM file. Thanks to Aaron Quinlan for implementing
    # this helpful feature.
    print("Preparing to call bedtools \'tagBam\'")
    if misc_utils.which("tagBam") is None:
        print("Aborting operation..")
        sys.exit(1)
    tagBam_cmd = get_tagBam_cmd(bam_filename, interval_label,
                                gff_filename)
    # Write intervals as BAM
    tagBam_cmd += " | samtools view -Shb -o %s - " \
                  % (output_filename)
    print(tagBam_cmd)
    # Discard the pipeline's stdout through the null device. Handing
    # subprocess.call a PIPE that nobody reads can block the child
    # once the OS pipe buffer fills up.
    with open(os.devnull, "w") as devnull:
        try:
            cmd_status = subprocess.call(tagBam_cmd,
                                         stdout=devnull,
                                         shell=True)
        except OSError as e:
            if e.errno == errno.ENOENT:
                raise Exception("Error: tagBam or one of the input filenames "
                                "does not exist. Are you sure tagBam is on your PATH?")
            # Do not hide unrelated OS-level failures
            raise
    if cmd_status != 0:
        # The BAM at output_filename cannot be trusted in this case,
        # so do not hand it back to the caller
        print("Error: tagBam call failed with exit status %d." % (cmd_status))
        return None
    return output_filename
def map_bam2gff(bam_filename, gff_filename, output_dir, interval_label="gff"):
    """
    Map BAM file against intervals in GFF, return results as BAM.

    Only keep hits that are in the interval.

    Uses tagBam utility from bedtools.

    Results are written to a file named after the input BAM inside
    the subdirectory "bam2gff_<GFF basename>" of output_dir, which is
    created when missing. An existing output file is never
    regenerated.

    Returns the output BAM filename if it already existed or if the
    command pipeline finished with exit status 0; returns None if the
    pipeline finished with a non-zero exit status.

    Calls sys.exit(1) when tagBam is not found. Raises Exception when
    starting the command fails with ENOENT and re-raises every other
    OSError.
    """
    gff_basename = os.path.basename(gff_filename)
    bam_basename = os.path.basename(bam_filename)
    output_dir = os.path.join(output_dir, "bam2gff_%s" % (gff_basename))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    output_filename = os.path.join(output_dir, bam_basename)

    print("Mapping BAM to GFF...")
    print("  - BAM: %s" % (bam_filename))
    print("  - GFF: %s" % (gff_filename))
    print("  - Output file: %s" % (output_filename))

    if os.path.isfile(output_filename):
        print("WARNING: %s exists. Skipping.." % (output_filename))
        return output_filename

    # "-intervals" option embeds the original GFF coordinates
    # in the output BAM file. Thanks to Aaron Quinlan for implementing
    # this helpful feature.
    print("Preparing to call bedtools \'tagBam\'")
    if misc_utils.which("tagBam") is None:
        print("Aborting operation..")
        sys.exit(1)
    tagBam_cmd = get_tagBam_cmd(bam_filename, interval_label, gff_filename)
    # Write intervals as BAM
    tagBam_cmd += " | samtools view -Shb -o %s - " % (output_filename)
    print(tagBam_cmd)
    # stdout goes to the null device instead of an unread PIPE, which
    # could stall the child process once the pipe buffer is full
    devnull = open(os.devnull, "w")
    try:
        cmd_status = subprocess.call(tagBam_cmd, stdout=devnull, shell=True)
    except OSError as e:
        if e.errno == errno.ENOENT:
            raise Exception("Error: tagBam or one of the input filenames "
                            "does not exist. Are you sure tagBam is on your PATH?")
        # Any other OS error is unexpected; let the caller see it
        raise
    finally:
        devnull.close()
    if cmd_status != 0:
        # A failed pipeline may leave an unusable file behind, so the
        # filename is not returned
        print("Error: tagBam call failed with exit status %d." % (cmd_status))
        return None
    return output_filename
Example #5
0
def compute_all_genes_psi(gff_dir, bam_filename, read_len,
                          output_dir, main_logger,
                          use_cluster=False,
                          SGEarray=False,
                          chunk_jobs=800,
                          overhang_len=1,
                          paired_end=None,
                          settings_fname=None,
                          job_name="misojob",
                          num_proc=None,
                          prefilter=False,
                          wait_on_jobs=True):
    """
    Compute Psi values for genes using a GFF and a BAM filename.

    SGE functionality contributed by Michael Lovci.

    Arguments:
    - gff_dir: reported as the GFF index; passed on to the input
      checks, the prefilter and the dispatcher.
    - bam_filename: BAM file to process.
    - read_len: read length (formatted as an integer).
    - output_dir: output directory, created via misc_utils.make_dir.
    - main_logger: object with info() and error() methods; also
      handed to run_events.check_gff_and_bam and GenesDispatcher.

    Options:
    - prefilter: if set to True, prefilter events by coverage.
      Uses bedtools to determine coverage of each event and remove
      events that do not meet the coverage criteria from the run.
    - job_name is forwarded to GenesDispatcher as sge_job_name; all
      remaining keyword options are forwarded under their own names.

    Exits the process with status 1 when prefiltering is requested
    but bedtools cannot be found, or when the prefilter returns an
    empty collection of ids. Returns None.
    """
    print("Computing Psi values...")
    print("  - GFF index: %s" % (gff_dir))
    print("  - BAM: %s" % (bam_filename))
    print("  - Read length: %d" % (read_len))
    print("  - Output directory: %s" % (output_dir))

    misc_utils.make_dir(output_dir)

    # Check GFF and BAM for various errors like headers mismatch
    run_events.check_gff_and_bam(gff_dir, bam_filename, main_logger,
                                 given_read_len=read_len)

    # Prefilter events that do not meet the coverage criteria
    # If filtering is on, only run on events that meet
    # the filter.
    all_gene_ids = None

    if prefilter:
        main_logger.info("Prefiltering on")
        if misc_utils.which("bedtools") is None:
            main_logger.error("Error: Cannot use bedtools. Bedtools is "
                              "required for --prefilter option")
            sys.exit(1)
        filtered_gene_ids = run_events.get_ids_passing_filter(gff_dir,
                                                              bam_filename,
                                                              output_dir)
        # Prefiltering succeeded, so process only gene ids that
        # pass the filter. Identity test: an empty result must still
        # reach the zero-events check below, and "is not None" does
        # not depend on how the returned object implements "!=".
        if filtered_gene_ids is not None:
            num_pass = len(filtered_gene_ids)
            all_gene_ids = filtered_gene_ids
            # If none of the events meet the read coverage filter
            # something must have gone wrong, e.g. mismatch
            # in chromosome headers between BAM and GFF
            if num_pass == 0:
                main_logger.error("None of the events in %s appear to meet the "
                                  "read coverage filter. Check that your BAM headers "
                                  "in %s match the GFF headers of indexed events."
                                  % (gff_dir,
                                     bam_filename))
                sys.exit(1)
            main_logger.info("Total of %d events pass coverage filter."
                             % (num_pass))

    ##
    ## Submit jobs either using cluster or locally
    ## using multi-cores.
    ##
    dispatcher = GenesDispatcher(gff_dir,
                                 bam_filename,
                                 output_dir,
                                 read_len,
                                 overhang_len,
                                 main_logger,
                                 settings_fname=settings_fname,
                                 paired_end=paired_end,
                                 use_cluster=use_cluster,
                                 chunk_jobs=chunk_jobs,
                                 sge_job_name=job_name,
                                 SGEarray=SGEarray,
                                 gene_ids=all_gene_ids,
                                 num_proc=num_proc,
                                 wait_on_jobs=wait_on_jobs)
    dispatcher.run()
Example #6
0
def compute_all_genes_psi(gff_dir, bam_filename, read_len, output_dir,
                          use_cluster=False,
                          SGEarray=False,
                          chunk_jobs=800,
                          overhang_len=1,
                          paired_end=None,
                          settings_fname=None,
                          job_name="misojob",
                          num_proc=None,
                          prefilter=False,
                          wait_on_jobs=True):
    """
    Compute Psi values for genes using a GFF and a BAM filename.

    SGE functionality contributed by Michael Lovci.

    Arguments:
    - gff_dir: reported as the GFF index; passed on to the input
      checks, the prefilter and the dispatcher.
    - bam_filename: BAM file to process.
    - read_len: read length (formatted as an integer).
    - output_dir: output directory, created via misc_utils.make_dir.

    Options:
    - prefilter: if set to True, prefilter events by coverage.
      Uses bedtools to determine coverage of each event and remove
      events that do not meet the coverage criteria from the run.
    - job_name is forwarded to GenesDispatcher as sge_job_name; all
      remaining keyword options are forwarded under their own names.

    Exits the process with status 1 when prefiltering is requested
    but bedtools cannot be found, or when the prefilter returns an
    empty collection of ids. Returns None.
    """
    print("Computing Psi values...")
    print("  - GFF index: %s" % (gff_dir))
    print("  - BAM: %s" % (bam_filename))
    print("  - Read length: %d" % (read_len))
    print("  - Output directory: %s" % (output_dir))

    misc_utils.make_dir(output_dir)

    # Check GFF and BAM for various errors like headers mismatch
    run_events.check_gff_and_bam(gff_dir, bam_filename,
                                 given_read_len=read_len)

    # Prefilter events that do not meet the coverage criteria
    # If filtering is on, only run on events that meet
    # the filter.
    all_gene_ids = None

    if prefilter:
        print("  - Prefiltering on")
        if misc_utils.which("bedtools") is None:
            print("Error: Cannot use bedtools. Bedtools is "
                  "required for --prefilter option")
            sys.exit(1)
        filtered_gene_ids = run_events.get_ids_passing_filter(gff_dir,
                                                              bam_filename,
                                                              output_dir)
        # Prefiltering succeeded, so process only gene ids that
        # pass the filter. Identity test: an empty result must still
        # reach the zero-events check below, and "is not None" does
        # not depend on how the returned object implements "!=".
        if filtered_gene_ids is not None:
            num_pass = len(filtered_gene_ids)
            all_gene_ids = filtered_gene_ids
            # If none of the events meet the read coverage filter
            # something must have gone wrong, e.g. mismatch
            # in chromosome headers between BAM and GFF
            if num_pass == 0:
                print("Error: None of the events in %s appear to meet the "
                      "read coverage filter. Check that your BAM headers "
                      "in %s match the GFF headers of indexed events."
                      % (gff_dir,
                         bam_filename))
                sys.exit(1)
            print("  - Total of %d events pass coverage filter."
                  % (num_pass))

    ##
    ## Submit jobs either using cluster or locally
    ## using multi-cores.
    ##
    dispatcher = GenesDispatcher(gff_dir,
                                 bam_filename,
                                 output_dir,
                                 read_len,
                                 overhang_len,
                                 settings_fname=settings_fname,
                                 paired_end=paired_end,
                                 use_cluster=use_cluster,
                                 chunk_jobs=chunk_jobs,
                                 sge_job_name=job_name,
                                 SGEarray=SGEarray,
                                 gene_ids=all_gene_ids,
                                 num_proc=num_proc,
                                 wait_on_jobs=wait_on_jobs)
    dispatcher.run()