Example #1
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--input-gff", dest="input_gff", default=None, nargs=1,
                      help="Create a database for input GFF filename. Takes a " \
                      "GFF filename.")
    parser.add_option("--output-dir", dest="output_dir", nargs=1, default=None,
                      help="Output directory.")
#    parser.add_option("--gtf", dest="gtf", default=False, action="store_true",
#                      help="Output file as GTF. Default is GFF.")
    (options, args) = parser.parse_args()

    if options.output_dir is None:
        print "Error: need --output-dir to be provided.\n"
        greeting()
        sys.exit(1)

    output_dir = utils.pathify(options.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if options.input_gff is not None:
        gff_fname = utils.pathify(options.input_gff)
        if not os.path.isfile(gff_fname):
            print "Error: GFF file %s does not exist." %(gff_fname)
            sys.exit(1)
        create_db(gff_fname, output_dir)
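Every snippet on this page calls utils.pathify, a project helper that is not shown here. As a minimal sketch of what it is assumed to do (expand the user's home directory and return an absolute path; the code below is illustrative, not the project's actual implementation):

import os

def pathify(filename):
    # Assumed behavior of utils.pathify: expand "~" (and any environment
    # variables), then return an absolute, normalized path.
    return os.path.abspath(os.path.expanduser(os.path.expandvars(filename)))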
Example #2
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--input-gff", dest="input_gff", default=None, nargs=1,
                      help="Create a database for input GFF filename. Takes a " \
                      "GFF filename.")
    parser.add_option("--output-dir",
                      dest="output_dir",
                      nargs=1,
                      default=None,
                      help="Output directory.")
    #    parser.add_option("--gtf", dest="gtf", default=False, action="store_true",
    #                      help="Output file as GTF. Default is GFF.")
    (options, args) = parser.parse_args()

    if options.output_dir is None:
        print "Error: need --output-dir to be provided.\n"
        greeting()
        sys.exit(1)

    output_dir = utils.pathify(options.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if options.input_gff is not None:
        gff_fname = utils.pathify(options.input_gff)
        if not os.path.isfile(gff_fname):
            print "Error: GFF file %s does not exist." % (gff_fname)
            sys.exit(1)
        create_db(gff_fname, output_dir)
Example #3
def compute_insert_lens(settings, output_dir, dry_run=False):
    """
    Compute insert lengths for all samples.
    """
    settings_filename = utils.pathify(settings)
    # Write misowrap logs under the given output directory
    logs_outdir = utils.pathify(output_dir)
    misowrap_obj = mw.MISOWrap(settings_filename, logs_outdir, logger_label="insert_lens")
    const_exons_gff = misowrap_obj.const_exons_gff
    if not os.path.isfile(const_exons_gff):
        print "Error: %s const exons GFF does not exist." % (const_exons_gff)
        sys.exit(1)
    pe_utils_path = misowrap_obj.pe_utils_cmd
    insert_len_output_dir = os.path.join(output_dir, "insert_lens")
    num_bams = len(misowrap_obj.bam_files)

    print "Computing insert lengths for %d files" % (num_bams)
    for bam_filename, sample_name in misowrap_obj.bam_files:
        print "Processing: %s" % (bam_filename)
        insert_len_cmd = "%s --compute-insert-len %s %s --output-dir %s" % (
            pe_utils_path,
            bam_filename,
            const_exons_gff,
            insert_len_output_dir,
        )
        print "Executing: %s" % (insert_len_cmd)
        job_name = "%s_insert_len" % (sample_name)
        if misowrap_obj.use_cluster:
            misowrap_obj.my_cluster.launch_job(insert_len_cmd, job_name, ppn=1)
        else:
            os.system(insert_len_cmd)
Example #4
def make_miso_annotation(tables_dir, output_dir, org_build):
    """
    Make GFF annotation. Takes GFF tables directory
    and an output directory.

    Adapted from
    https://github.com/yarden/rnaseqlib/
    """
    tables_dir = utils.pathify(tables_dir)
    output_dir = utils.pathify(output_dir)
    print "Making GFF alternative events annotation..."
    print " - UCSC tables read from: %s" % (tables_dir)
    print " - Output dir: %s" % (output_dir)
    t1 = time.time()
    table_fnames = def_events.load_ucsc_tables(tables_dir)
    num_tables = len(table_fnames)
    if num_tables == 0:
        raise Exception("No UCSC tables found in %s." % (tables_dir))
    print "Loaded %d UCSC tables." % (num_tables)
    def_events.defineAllSplicing(tables_dir,
                                 output_dir,
                                 flanking="commonshortest",
                                 multi_iso=False,
                                 sanitize=False,
                                 genome_label=org_build)
    t2 = time.time()
    print "Took %.2f minutes to make the annotation." \
        % ((t2 - t1)/60.)
Example #5
def make_miso_annotation(tables_dir, output_dir, org_build):
    """
    Make GFF annotation. Takes GFF tables directory
    and an output directory.

    Adapted from
    https://github.com/yarden/rnaseqlib/
    """
    tables_dir = utils.pathify(tables_dir)
    output_dir = utils.pathify(output_dir)
    print "Making GFF alternative events annotation..."
    print " - UCSC tables read from: %s" % (tables_dir)
    print " - Output dir: %s" % (output_dir)
    t1 = time.time()
    table_fnames = def_events.load_ucsc_tables(tables_dir)
    num_tables = len(table_fnames)
    if num_tables == 0:
        raise Exception("No UCSC tables found in %s." % (tables_dir))
    print "Loaded %d UCSC tables." % (num_tables)
    def_events.defineAllSplicing(tables_dir, output_dir,
                                 flanking="commonshortest",
                                 multi_iso=False,
                                 sanitize=False,
                                 genome_label=org_build)
    t2 = time.time()
    print "Took %.2f minutes to make the annotation." \
        % ((t2 - t1)/60.)
Example #6
def make_annotation(args):
    """
    Make GFF annotation. Takes GFF tables directory
    and an output directory.
    """
    tables_dir = utils.pathify(args.tables_dir)
    output_dir = utils.pathify(args.output_dir)
    print "Making GFF alternative events annotation..."
    print "  - UCSC tables read from: %s" % (tables_dir)
    print "  - Output dir: %s" % (output_dir)
    t1 = time.time()
    table_fnames = def_events.load_ucsc_tables(tables_dir)
    num_tables = len(table_fnames)
    if num_tables == 0:
        raise Exception, "No UCSC tables found in %s." % (tables_dir)
    print "Loaded %d UCSC tables." % (num_tables)
    def_events.defineAllSplicing(tables_dir,
                                 output_dir,
                                 flanking=args.flanking_rule,
                                 multi_iso=args.multi_iso,
                                 genome_label=args.genome_label,
                                 sanitize=args.sanitize)
    t2 = time.time()
    print "Took %.2f minutes to make the annotation." \
          %((t2 - t1)/60.)
Example #7
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--run", dest="run", action="store_true",
                      default=False,
                      help="Run pipeline.")
    parser.add_option("--run-on-sample", dest="run_on_sample", nargs=1, default=None,
                      help="Run on a particular sample. Takes as input the sample label.")
    parser.add_option("--settings", dest="settings", nargs=1,
                      default=None,
                      help="Settings filename.")
    parser.add_option("--init", dest="initialize", nargs=1, default=None,
                      help="Initialize the pipeline. Takes as input a genome, "
                      "e.g. mm9 or hg18")
    parser.add_option("--output-dir", dest="output_dir", nargs=1,
                      default=None,
                      help="Output directory.")
    (options, args) = parser.parse_args()

    greeting()

    if options.output_dir is None:
        print "Error: need --output-dir"
        parser.print_help()
        sys.exit(1)
        
    output_dir = utils.pathify(options.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    settings_filename = None
    if options.run:
        if options.settings is None:
            # Running of pipeline requires settings filename
            print "Error: need --settings"
            parser.print_help()
            sys.exit(1)
        settings_filename = utils.pathify(options.settings)
        run_pipeline(settings_filename,
                     output_dir)

    if options.run_on_sample is not None:
        if options.settings is None:
            # Running of pipeline requires settings filename
            print "Error: need --settings"
            parser.print_help()
            sys.exit(1)
        settings_filename = utils.pathify(options.settings)
        sample_label = options.run_on_sample
        run_on_sample(sample_label, settings_filename,
                      output_dir)

    if options.initialize is not None:
        genome = options.initialize
        initialize_pipeline(genome,
                            output_dir)
Example #8
def main():
    from optparse import OptionParser
    parser = OptionParser()
      
    parser.add_option("--input-gff", dest="input_gff", default=None, nargs=1,
                      help="Fetch sequence from GFF events file. Takes as input: "
                      "GFF filename.")
    parser.add_option("--fi", dest="fasta_fname", default=None, nargs=1,
                      help="FASTA filename to fetch sequences from.")
    parser.add_option("--with-flanking-introns", dest="with_flanking_introns",
                      default=False, action="store_true",
                      help="Get sequence of flanking introns relative to skipped exon.")
    parser.add_option("--flanking-introns-coords", dest="flanking_introns_coords",
                      default=None, nargs=4,
                      help="Fetch the sequences of the flanking introns "
                      "(for SpliceGraph events). Takes as input the intervals to " 
                      "be used, which are: "
                      "(1) start position relative to 5 prime splice site of SE "
                      "(negative int), "
                      "(2) end position 5 prime splice site (negative int), "
                      "(3) start position relative to 3 prime splice site "
                      "(positive int), "
                      "(4) end position relative to 3 prime splice site. "
                      "(posiitve int). "
                      "Suggested settings are -250, -20, 20, -250.")
    parser.add_option("--output-dir", dest="output_dir", nargs=1, default=None,
                      help="Output directory.")
    (options, args) = parser.parse_args()

    if options.output_dir is None:
        greeting()
        print "Error: need --output-dir to be provided."
        sys.exit(1)
        
    output_dir = utils.pathify(options.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if options.input_gff is not None:
        if options.fasta_fname is None:
            greeting()
            print "Error: Must provide input fasta file with --fi."
            sys.exit(1)
        # Check for FASTA
        gff_filename = utils.pathify(options.input_gff)
        fasta_fname = utils.pathify(options.fasta_fname)
        flanking_introns_coords = options.flanking_introns_coords
        gffutils_helpers.fetch_seq_from_gff(gff_filename, fasta_fname, output_dir,
                                            with_flanking_introns=options.with_flanking_introns,
                                            flanking_introns_coords=options.flanking_introns_coords)
Example #9
 def load_summaries(self, miso_samples_dir):
     """
     Load MISO summary files.
     """
     miso_samples_dir = utils.pathify(miso_samples_dir)
     print "Loading summary files.."
     summaries_dict = defaultdict(dict)
     for sample in self.sample_labels:
         for event_type in self.event_types:
             sample_name, label = sample
             print "SAMPLE NAME: ", sample_name
             print " SAMPLE LABEL: ", label
             sample_dir = os.path.join(miso_samples_dir, sample_name,
                                       event_type)
             if not os.path.isdir(sample_dir):
                 print "WARNING: Skipping %s..." \
                     %(sample_dir)
                 continue
             summary_filename = get_summary_filename(sample_dir)
             if not os.path.isfile(summary_filename):
                 print "WARNING: %s not a summary file" \
                     %(summary_filename)
                 continue
             summary_df = pandas.read_table(summary_filename,
                                            sep=self.delimiter)
             summaries_dict[event_type][sample_name] = summary_df
     self.summaries_df = pandas.DataFrame(summaries_dict)
Example #10
 def load_events_to_genes(self, delimiter="\t"):
     """
     Load mapping from events to genes. Use the new GFF files
     for this.
     """
     basename_card = "*.gff3"
     events_to_genes_dir = None
     if "events_to_genes" in self.settings_info["settings"]:
         events_to_genes_dir = \
             utils.pathify(self.settings_info["settings"]["events_to_genes_dir"])
     else:
         return
     gff_fnames = \
         glob.glob(os.path.join(events_to_genes_dir, basename_card))
     print "Loading events to genes mapping..."
     print "  - Input directory: %s" %(events_to_genes_dir)
     print "  - Number of files: %d" %(len(gff_fnames))
     self.events_to_genes = defaultdict(lambda: defaultdict(str))
     for fname in gff_fnames:
         event_type = os.path.basename(fname).split(".")[0]
         gff_entries = pybedtools.BedTool(fname)
         gene_entries = gff_entries.filter(lambda x: x.fields[2] == "gene")
         for gene in gene_entries:
             # Parse Ensembl gene, RefSeq and gene symbols
             attrs = gene.attrs
             self.events_to_genes[event_type][attrs["ID"]] = \
                 {"ensg_id": attrs["ensg_id"],
                  "refseq_id": attrs["refseq_id"],
                  "gsymbol": attrs["gsymbol"]}
Example #11
def get_default_db_fname(gff_fname, db_dirname="gff_db"):
    """
    Look for canonical GFF database filename. If exists,
    return its path, otherwise return None.

    Looks for that has 'gff_fname's basename ending in .db
    inside a 'gff_db' subdirectory in the same
    directory where 'gff_fname' is stored.

    For example, if 'gff_fname' is /home/user/mygff.gff it will
    look for /home/user/gff_db/mygff.gff.db.
    """
    gff_fname = utils.pathify(gff_fname)
    # If the input ends in .db, assume it is the database
    if gff_fname.endswith(".db"):
        return gff_fname
    gff_basename = os.path.basename(gff_fname)
    gff_db_dir = os.path.join(os.path.dirname(gff_fname),
                              db_dirname)
    if not os.path.isdir(gff_db_dir):
        return None
    db_fname = os.path.join(gff_db_dir, "%s.db" %(gff_basename))
    if not os.path.isfile(db_fname):
        return None
    return db_fname
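A hypothetical call, matching the docstring's example path (the path itself is only illustrative):

# For /home/user/mygff.gff this returns /home/user/gff_db/mygff.gff.db
# if that database file exists, and None otherwise.
db_fname = get_default_db_fname("/home/user/mygff.gff")
if db_fname is None:
    print("No pre-built GFF database found next to the input GFF.")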
Example #12
 def load_summaries(self, miso_samples_dir):
     """
     Load MISO summary files.
     """
     miso_samples_dir = utils.pathify(miso_samples_dir)
     print "Loading summary files.."
     summaries_dict = defaultdict(dict)
     for sample in self.sample_labels:
         for event_type in self.event_types:
             sample_name, label = sample
             print "SAMPLE NAME: ", sample_name
             print " SAMPLE LABEL: ", label
             sample_dir = os.path.join(miso_samples_dir,
                                       sample_name,
                                       event_type)
             if not os.path.isdir(sample_dir):
                 print "WARNING: Skipping %s..." \
                     %(sample_dir)
                 continue
             summary_filename = get_summary_filename(sample_dir)
             if not os.path.isfile(summary_filename):
                 print "WARNING: %s not a summary file" \
                     %(summary_filename)
                 continue
             summary_df = pandas.read_table(summary_filename,
                                            sep=self.delimiter)
             summaries_dict[event_type][sample_name] = summary_df
     self.summaries_df = pandas.DataFrame(summaries_dict)
Example #13
 def load_events_to_genes(self, delimiter="\t"):
     """
     Load mapping from events to genes. Use the new GFF files
     for this.
     """
     basename_card = "*.gff3"
     events_to_genes_dir = None
     if "events_to_genes" in self.settings_info["settings"]:
         events_to_genes_dir = \
             utils.pathify(self.settings_info["settings"]["events_to_genes_dir"])
     else:
         return
     gff_fnames = \
         glob.glob(os.path.join(events_to_genes_dir, basename_card))
     print "Loading events to genes mapping..."
     print "  - Input directory: %s" % (events_to_genes_dir)
     print "  - Number of files: %d" % (len(gff_fnames))
     self.events_to_genes = defaultdict(lambda: defaultdict(str))
     for fname in gff_fnames:
         event_type = os.path.basename(fname).split(".")[0]
         gff_entries = pybedtools.BedTool(fname)
         gene_entries = gff_entries.filter(lambda x: x.fields[2] == "gene")
         for gene in gene_entries:
             # Parse Ensembl gene, RefSeq and gene symbols
             attrs = gene.attrs
             self.events_to_genes[event_type][attrs["ID"]] = \
                 {"ensg_id": attrs["ensg_id"],
                  "refseq_id": attrs["refseq_id"],
                  "gsymbol": attrs["gsymbol"]}
Example #14
def summarize(settings,
              logs_outdir,
              delay=5,
              dry_run=False):
    """
    Summarize samples in MISO directory.
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="summarize")
    bam_files = misowrap_obj.bam_files
    sample_labels = misowrap_obj.sample_labels
    print "Summarizing MISO output..."
    for sample_label in sample_labels:
        sample_basename = sample_label[0]
        sample_dir_path = \
            utils.pathify(os.path.join(misowrap_obj.miso_outdir,
                                       sample_basename))
        print "Processing: %s" %(sample_basename)
        if not os.path.isdir(sample_dir_path):
            print "Skipping non-directory: %s" %(sample_dir_path)
            continue
        # List all event directories in the sample
        event_dirs = os.listdir(sample_dir_path)
        for event_dirname in event_dirs:
            event_dir_path = utils.pathify(os.path.join(sample_dir_path,
                                                        event_dirname))
            if not os.path.isdir(event_dir_path):
                print "Skipping non-dir: %s" %(event_dir_path)
                continue
            print "Processing event type: %s" %(event_dirname)
            summary_cmd = \
                "%s --summarize-samples %s %s --summary-label %s" \
                %(misowrap_obj.summarize_miso_cmd,
                  event_dir_path,
                  event_dir_path,
                  sample_basename)
            job_name = "summarize_%s_%s" %(sample_basename,
                                           os.path.basename(event_dirname))
            print "Executing: %s" %(summary_cmd)
            if misowrap_obj.use_cluster:
                if not dry_run:
                    misowrap_obj.my_cluster.launch_job(summary_cmd,
                                                       job_name,
                                                       ppn=1)
            else:
                if not dry_run:
                    os.system(summary_cmd)
Example #15
def get_event_types_dirs(settings_info):
    """
    Return event types.
    """
    miso_events_dir = \
      utils.pathify(settings_info["settings"]["miso_events_dir"])
    event_types_dirs = [os.path.join(miso_events_dir, dirname) \
                        for dirname in os.listdir(miso_events_dir)]
    return event_types_dirs
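A hedged illustration of the settings layout this helper expects; only the "miso_events_dir" key is taken from the code above, and the path is a placeholder:

settings_info = {"settings": {"miso_events_dir": "~/annotations/miso_events"}}
# Each subdirectory of miso_events_dir (e.g. SE, A3SS, RI) becomes one entry
# in the returned list of event-type directories.
event_types_dirs = get_event_types_dirs(settings_info)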
Example #16
def get_event_types_dirs(settings_info):
    """
    Return event types.
    """
    miso_events_dir = \
      utils.pathify(settings_info["settings"]["miso_events_dir"])
    event_types_dirs = [os.path.join(miso_events_dir, dirname) \
                        for dirname in os.listdir(miso_events_dir)]
    return event_types_dirs
Example #17
def filter(settings, logs_outdir, dry_run=False):
    """
    Output a set of filtered MISO comparisons.
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename, logs_outdir, logger_label="filter")
    misowrap_obj.logger.info("Filtering MISO events...")
    psi_table = pt.PsiTable(misowrap_obj)
    psi_table.output_filtered_comparisons()
Example #18
def summarize_miso_samples(settings_filename,
                           output_dir):
    """
    Summarize samples in MISO directory.
    """
    misowrap_obj = MISOWrap(settings_filename,
                            output_dir,
                            logger_label="summarize")
    bam_files = misowrap_obj.bam_files
    sample_labels = misowrap_obj.sample_labels
    print "Summarizing MISO output..."
    print "  - Output dir: %s" %(output_dir)
    run_miso_cmd = misowrap_obj.run_miso_cmd
    for sample_label in sample_labels:
        print "sample label: ", sample_label
        sample_basename = sample_label[0]
        sample_dir_path = \
            utils.pathify(os.path.join(misowrap_obj.miso_outdir,
                                       sample_basename))
        print "Processing: %s" %(sample_basename)
        if not os.path.isdir(sample_dir_path):
            print "Skipping non-directory: %s" %(sample_dir_path)
            continue
        # List all event directories in the sample
        event_dirs = os.listdir(sample_dir_path)
        for event_dirname in event_dirs:
            event_dir_path = utils.pathify(os.path.join(sample_dir_path,
                                                        event_dirname))
            if not os.path.isdir(event_dir_path):
                print "Skipping non-dir: %s" %(event_dir_path)
                continue
            print "Processing event type: %s" %(event_dirname)
            summary_cmd = \
                "%s --summarize-samples %s %s --summary-label %s" \
                %(run_miso_cmd,
                  event_dir_path,
                  event_dir_path,
                  sample_basename)
            job_name = "summarize_%s_%s" %(sample_basename,
                                           os.path.basename(event_dirname))
            print "Executing: %s" %(summary_cmd)
            if misowrap_obj.use_cluster:
                misowrap_obj.my_cluster.launch_job(summary_cmd,
                                                   job_name)
            else:
                os.system(summary_cmd)
Example #19
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--input-gff",
                      dest="input_gff",
                      default=None,
                      nargs=1,
                      help="Input GFF filename for a GFF database.")
    parser.add_option("--output-dir",
                      dest="output_dir",
                      nargs=1,
                      default=None,
                      help="Output directory.")
    parser.add_option("--gtf",
                      dest="gtf",
                      default=False,
                      action="store_true",
                      help="Output a GTF file instead of GFF.")
    parser.add_option("--db-subdir", dest="db_subdir", default="gff_db",
                      help="Name of output subdirectory containing GFF " \
                      "database. By default, creates \'gff_db\' " \
                      "subdirectory in the directory given to --output-dir.")
    parser.add_option("--no-db-output",
                      dest="no_db_output",
                      default=False,
                      action="store_true",
                      help="Do not output a GFF database.")

    (options, args) = parser.parse_args()

    if options.output_dir is None:
        print "Error: need --output-dir to be provided.\n"
        greeting()
        sys.exit(1)

    output_dir = utils.pathify(options.output_dir)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if options.input_gff is not None:
        gff_fname = utils.pathify(options.input_gff)
        sanitize_gff(gff_fname, output_dir)
Example #20
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--input-gff", dest="input_gff", default=None, nargs=1,
                      help="Extract lengths from GFF file.")
    parser.add_option("--output-dir", dest="output_dir", nargs=1, default=None,
                      help="Output directory.")
    (options, args) = parser.parse_args()

    if options.output_dir is None:
        print "Error: need --output-dir to be provided.\n"
        greeting()
        sys.exit(1)

    output_dir = utils.pathify(options.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if options.input_gff is not None:
        gff_fname = utils.pathify(options.input_gff)
        extract_lens_from_gff(gff_fname, output_dir)
Example #21
def get_output_db_fname(gff_fname, output_dir,
                        db_subdir="gff_db"):
    """
    Return output file for 'gff_fname' (either a GFF db or
    a regular GFF file) in 'output_dir'
    """
    gff_fname = utils.pathify(gff_fname)
    gff_basename = os.path.basename(gff_fname)
    db_fname = \
        os.path.join(output_dir, db_subdir, gff_basename)
    if not db_fname.endswith(".db"):
        db_fname += ".db"
    return db_fname
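A hypothetical usage (paths are placeholders):

# A plain GFF input is mapped into the gff_db subdirectory of the output dir,
# e.g. /data/mygff.gff -> /tmp/out/gff_db/mygff.gff.db; an input that already
# ends in .db keeps its name.
db_fname = get_output_db_fname("/data/mygff.gff", "/tmp/out")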
Example #22
def filter(settings,
           logs_outdir,
           dry_run=False):
    """
    Output a set of filtered MISO comparisons.
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="filter")
    misowrap_obj.logger.info("Filtering MISO events...")
    psi_table = pt.PsiTable(misowrap_obj)
    psi_table.output_filtered_comparisons()
Example #23
def compare(settings, logs_outdir, delay=5, dry_run=False):
    """
    Run a MISO samples comparison between all pairs of samples.
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename, logs_outdir, logger_label="compare")
    bam_files = misowrap_obj.bam_files
    sample_labels = misowrap_obj.sample_labels
    read_len = misowrap_obj.read_len
    overhang_len = misowrap_obj.overhang_len
    miso_bin_dir = misowrap_obj.miso_bin_dir
    miso_output_dir = misowrap_obj.miso_outdir
    comparison_groups = misowrap_obj.comparison_groups
    comparisons_dir = misowrap_obj.comparisons_dir
    utils.make_dir(comparisons_dir)
    misowrap_obj.logger.info("Running MISO comparisons...")
    ##
    ## Compute comparisons between all pairs
    ## in a sample group
    ##
    for comp_group in comparison_groups:
        sample_pairs = utils.get_pairwise_comparisons(comp_group)
        print "  - Total of %d comparisons" % (len(sample_pairs))
        for sample1, sample2 in sample_pairs:
            # For each pair of samples, compare their output
            # along each event type
            misowrap_obj.logger.info("Comparing %s %s" % (sample1, sample2))
            # Directories for each sample
            sample1_dir = os.path.join(miso_output_dir, sample1)
            sample2_dir = os.path.join(miso_output_dir, sample2)
            for event_type in misowrap_obj.event_types:
                sample1_event_dir = os.path.join(sample1_dir, event_type)
                sample2_event_dir = os.path.join(sample2_dir, event_type)
                job_name = "compare_%s_%s_%s" % (sample1, sample2, event_type)
                event_comparisons_dir = os.path.join(comparisons_dir, event_type)
                compare_cmd = "%s --compare-samples %s %s %s " "--comparison-labels %s %s" % (
                    misowrap_obj.compare_miso_cmd,
                    sample1_event_dir,
                    sample2_event_dir,
                    event_comparisons_dir,
                    sample1,
                    sample2,
                )
                misowrap_obj.logger.info("Executing: %s" % (compare_cmd))
                if misowrap_obj.use_cluster:
                    if not dry_run:
                        misowrap_obj.my_cluster.launch_job(compare_cmd, job_name, ppn=1)
                        time.sleep(delay)
                else:
                    if not dry_run:
                        os.system(compare_cmd)
Example #24
def main():
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option(
        "--input-gff", dest="input_gff", default=None, nargs=1, help="Input GFF filename for a GFF database."
    )
    parser.add_option("--output-dir", dest="output_dir", nargs=1, default=None, help="Output directory.")
    parser.add_option("--gtf", dest="gtf", default=False, action="store_true", help="Output a GTF file instead of GFF.")
    parser.add_option(
        "--db-subdir",
        dest="db_subdir",
        default="gff_db",
        help="Name of output subdirectory containing GFF "
        "database. By default, creates 'gff_db' "
        "subdirectory in the directory given to --output-dir.",
    )
    parser.add_option(
        "--no-db-output", dest="no_db_output", default=False, action="store_true", help="Do not output a GFF database."
    )

    (options, args) = parser.parse_args()

    if options.output_dir is None:
        print "Error: need --output-dir to be provided.\n"
        greeting()
        sys.exit(1)

    output_dir = utils.pathify(options.output_dir)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if options.input_gff is not None:
        gff_fname = utils.pathify(options.input_gff)
        sanitize_gff(gff_fname, output_dir)
Example #25
def make_annotation(args):
    """
    Make GFF annotation. Takes GFF tables directory
    and an output directory.
    """
    tables_dir = utils.pathify(args.tables_dir)
    output_dir = utils.pathify(args.output_dir)
    print "Making GFF alternative events annotation..."
    print "  - UCSC tables read from: %s" %(tables_dir)
    print "  - Output dir: %s" %(output_dir)
    t1 = time.time()
    table_fnames = def_events.load_ucsc_tables(tables_dir)
    num_tables = len(table_fnames)
    if num_tables == 0:
        raise Exception, "No UCSC tables found in %s." %(tables_dir)
    print "Loaded %d UCSC tables." %(num_tables)
    def_events.defineAllSplicing(tables_dir, output_dir,
                                 flanking=args.flanking_rule,
                                 multi_iso=args.multi_iso,
                                 genome_label=args.genome_label,
                                 sanitize=args.sanitize)
    t2 = time.time()
    print "Took %.2f minutes to make the annotation." \
          %((t2 - t1)/60.)
Example #26
def compute_insert_lens(settings,
                        output_dir,
                        dry_run=False):
    """
    Compute insert lengths for all samples.
    """
    settings_filename = utils.pathify(settings)
    # Write misowrap logs under the given output directory
    logs_outdir = utils.pathify(output_dir)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="insert_lens")
    const_exons_gff = misowrap_obj.const_exons_gff
    if not os.path.isfile(const_exons_gff):
        print "Error: %s const exons GFF does not exist." \
            %(const_exons_gff)
        sys.exit(1)
    pe_utils_path = misowrap_obj.pe_utils_cmd 
    insert_len_output_dir = os.path.join(output_dir, "insert_lens")
    num_bams = len(misowrap_obj.bam_files)
    
    print "Computing insert lengths for %d files" %(num_bams)
    for bam_filename, sample_name in misowrap_obj.bam_files:
        print "Processing: %s" %(bam_filename)
        insert_len_cmd = "%s --compute-insert-len %s %s --output-dir %s" \
            %(pe_utils_path,
              bam_filename,
              const_exons_gff,
              insert_len_output_dir)
        print "Executing: %s" %(insert_len_cmd)
        job_name = "%s_insert_len" %(sample_name)
        if misowrap_obj.use_cluster:
            misowrap_obj.my_cluster.launch_job(insert_len_cmd,
                                               job_name,
                                               ppn=1)
        else:
            os.system(insert_len_cmd)
Example #27
def summarize(settings, logs_outdir, delay=5, dry_run=False):
    """
    Summarize samples in MISO directory.
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename, logs_outdir, logger_label="summarize")
    bam_files = misowrap_obj.bam_files
    sample_labels = misowrap_obj.sample_labels
    print "Summarizing MISO output..."
    for sample_label in sample_labels:
        sample_basename = sample_label[0]
        sample_dir_path = utils.pathify(os.path.join(misowrap_obj.miso_outdir, sample_basename))
        print "Processing: %s" % (sample_basename)
        if not os.path.isdir(sample_dir_path):
            print "Skipping non-directory: %s" % (sample_dir_path)
            continue
        # List all event directories in the sample
        event_dirs = os.listdir(sample_dir_path)
        for event_dirname in event_dirs:
            event_dir_path = utils.pathify(os.path.join(sample_dir_path, event_dirname))
            if not os.path.isdir(event_dir_path):
                print "Skipping non-dir: %s" % (event_dir_path)
                continue
            print "Processing event type: %s" % (event_dirname)
            summary_cmd = "%s --summarize-samples %s %s --summary-label %s" % (
                misowrap_obj.summarize_miso_cmd,
                event_dir_path,
                event_dir_path,
                sample_basename,
            )
            job_name = "summarize_%s_%s" % (sample_basename, os.path.basename(event_dirname))
            print "Executing: %s" % (summary_cmd)
            if misowrap_obj.use_cluster:
                if not dry_run:
                    misowrap_obj.my_cluster.launch_job(summary_cmd, job_name, ppn=1)
            else:
                if not dry_run:
                    os.system(summary_cmd)
Example #28
def main():
    from optparse import OptionParser
    parser = OptionParser()
      
    parser.add_option("--run", dest="run", nargs=1, default=None,
                      help="Run MISO on a set of events. "
                      "Takes a settings filename.")
    parser.add_option("--summarize", dest="summarize", nargs=1, default=None,
                      help="Run MISO summarize on a set of samples. "
                      "Takes a settings filename.")
    parser.add_option("--compare", dest="compare", nargs=1, default=None,
                      help="Run MISO sample comparisons on all pairwise "
                      "comparisons. Takes a settings filename.")
    parser.add_option("--filter", dest="filter", nargs=1,
                      default=None,
                      help="Filter a set of MISO events. "
                      "Takes a settings filename.")
    parser.add_option("--compute-insert-lens", dest="compute_insert_lens",
                      nargs=1, default=None,
                      help="Compute insert lengths for a set of BAM files. " 
                      "takes a settings filename.")
    parser.add_option("--output-dir", dest="output_dir", default=None,
                      help="Output directory.")
    (options, args) = parser.parse_args()

    greeting()

    if options.output_dir is None:
        print "Error: need --output-dir.\n"
        parser.print_help()
        sys.exit(1)
        
    output_dir = utils.pathify(options.output_dir)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if options.run is not None:
        settings_filename = utils.pathify(options.run)
        run_miso_on_samples(settings_filename, output_dir)

    if options.summarize is not None:
        settings_filename = utils.pathify(options.summarize)
        summarize_miso_samples(settings_filename, output_dir)
        
    if options.compare is not None:
        settings_filename = utils.pathify(options.compare)
        compare_miso_samples(settings_filename, output_dir)

    if options.filter is not None:
        settings_filename = utils.pathify(options.filter)
        filter_events(settings_filename, output_dir)

    if options.compute_insert_lens is not None:
        settings_filename = utils.pathify(options.compute_insert_lens)
        compute_insert_lens(settings_filename, output_dir)
Example #29
def read_pe_params(insert_len_filename):
    """
    Get paired-end parameters from .insert_len file.
    """
    insert_len_filename = utils.pathify(insert_len_filename)
    if not os.path.isfile(insert_len_filename):
        print "Error: %s not a file." %(insert_len_filename)
        sys.exit(1)
    insert_file = open(insert_len_filename, "r")
    fields = insert_file.readline()[1:].strip().split(",")
    pe_params = {}
    for field in fields:
        k, v = field.split("=")
        pe_params[k] = float(v)
    insert_file.close()
    return pe_params
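A sketch of the .insert_len header this parser expects. The exact field names are an assumption; any comma-separated key=value pairs on the first line, after a leading "#", are parsed the same way:

# Hypothetical first line of accepted_hits.bam.insert_len:
#   #mean=250.24,sdev=15.83,dispersion=1.32,num_pairs=940530
pe_params = read_pe_params("accepted_hits.bam.insert_len")
print("mean=%.2f, sdev=%.2f" % (pe_params["mean"], pe_params["sdev"]))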
Example #30
    def load_events_to_genes(self,
                             source="ensGene",
                             delimiter="\t"):
        """
        Load mapping from events to genes.

        Expects a directory with files named
        according to events, e.g.:
        
          SE.mm9.gff3_to_ensGene.txt        
        """
        if "events_to_genes_dir" not in self.settings_info["settings"]:
            return
        events_to_genes_dir = \
            self.settings_info["settings"]["events_to_genes_dir"]
        events_to_genes_dir = utils.pathify(events_to_genes_dir)
        print "Loading events to genes mapping from: %s" \
            %(events_to_genes_dir)
        # If we're given mapping from events to genes, load
        # these and index them by event type.
        if not os.path.isdir(events_to_genes_dir):
            print "Error: %s not a directory."
            sys.exit(1)
        basename_card = "*_to_%s.txt" %(source)
        events_to_genes_files = \
            glob.glob(os.path.join(events_to_genes_dir,
                                   basename_card))
        if len(events_to_genes_files) == 0:
            print "Error: %s directory contains no %s files." \
                %(events_to_genes_dir,
                  basename_card)
            sys.exit(1)
        self.events_to_genes = defaultdict(lambda: defaultdict(list))
        for fname in events_to_genes_files:
            # Extract event type based on filename
            event_type = os.path.basename(fname).split(".")[0]
            with open(fname, "r") as events_file:
                events_entries = csv.DictReader(events_file,
                                                delimiter=delimiter)
                for entry in events_entries:
                    event_id = entry["event_id"]
                    # Parse genes into a list
                    genes = entry["gene_id"].split(",")
                    # Index events by their type and then by
                    # their ID
                    self.events_to_genes[event_type][event_id].extend(genes)
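A hedged sketch of the mapping file read by the DictReader loop above; the column names come from the code, while the file contents are placeholders:

# Hypothetical SE.mm9.gff3_to_ensGene.txt (tab-delimited):
#
#   event_id                                        gene_id
#   chr1:100:200:+@chr1:300:400:+@chr1:500:600:+    ENSMUSG00000000001,ENSMUSG00000000002
#
# After loading, genes are looked up by event type and event ID, e.g.
# self.events_to_genes["SE"][event_id] -> list of gene IDs.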
Example #31
 def __init__(self, settings_filename, output_dir,
              logger_label=None):
     self.settings_filename = settings_filename
     self.settings_info = None
     self.logger_label = logger_label
     # Main output directory
     self.output_dir = utils.pathify(output_dir)
     utils.make_dir(self.output_dir)
     # MISO output directory (where raw output is)
     self.miso_outdir = None
     # Comparisons output directory
     self.comparisons_outdir = None
     # BAM files to process
     self.bam_files = None
     # Sample labels
     self.sample_labels = None
     self.comparison_groups = None
     # Insert length directory (for paired-end samples)
     self.insert_lens_dir = None
     # Logs output directory
     self.logs_outdir = None
     # Logger object
     self.logger = None
     # Cluster submission object
     self.my_cluster = None
     # Event types to process
     self.event_types = None
     # Whether to submit jobs to cluster
     self.use_cluster = False
     # run_miso cmd
     self.run_miso_cmd = None
     # run_events_analysis cmd
     self.run_events_cmd = None
     # Constitutive exons GFF file: used to compute
     # the insert length distribution
     self.const_exons_gff = None
     # Load settings
     self.load_settings()
     ##
     ## Load annotation of events, like a map
     ## events to genes.
     ##
     self.events_to_genes = None
     self.load_events_to_genes()
Example #32
 def __init__(self, settings_filename, output_dir, logger_label=None):
     self.settings_filename = settings_filename
     self.settings_info = None
     self.logger_label = logger_label
     # Main output directory
     self.output_dir = utils.pathify(output_dir)
     utils.make_dir(self.output_dir)
     # MISO output directory (where raw output is)
     self.miso_outdir = None
     # Comparisons output directory
     self.comparisons_outdir = None
     # BAM files to process
     self.bam_files = None
     # Sample labels
     self.sample_labels = None
     self.comparison_groups = None
     # Insert length directory (for paired-end samples)
     self.insert_lens_dir = None
     # Logs output directory
     self.logs_outdir = None
     # Logger object
     self.logger = None
     # Cluster submission object
     self.my_cluster = None
     # Event types to process
     self.event_types = None
     # Whether to submit jobs to cluster
     self.use_cluster = False
     # run_miso cmd
     self.run_miso_cmd = None
     # run_events_analysis cmd
     self.run_events_cmd = None
     # Constitutive exons GFF file: used to compute
     # the insert length distribution
     self.const_exons_gff = None
     # Load settings
     self.load_settings()
     ##
     ## Load annotation of events, like a map
     ## events to genes.
     ##
     self.events_to_genes = None
     self.load_events_to_genes()
Example #33
def get_bf_filename(pairwise_comparison_dir):
    """
    Return a Bayes factor filename from a
    pairwise comparisons directory.
    """
    pairwise_comparison_dir = utils.pathify(pairwise_comparison_dir)
    if not os.path.isdir(pairwise_comparison_dir):
        print "WARNING: Could not find %s" %(pairwise_comparison_dir)
        return None
    bf_dir = os.path.join(pairwise_comparison_dir, "bayes-factors")
    if not os.path.isdir(bf_dir):
        # Attempt current directory without "bayes-factor"
        # inner directory
        bf_dir = pairwise_comparison_dir
    bf_filename = glob.glob(os.path.join(bf_dir,
                                         "*.miso_bf"))
    if len(bf_filename) == 0:
        print "Error: No BF filename found in %s" %(bf_dir)
        return None
    if len(bf_filename) > 1:
        print "Error: Multiple BF filenames in %s" %(bf_dir)
        return None
    bf_filename = bf_filename[0]
    return bf_filename
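A hypothetical call (the directory name is a placeholder):

# Locate the single *.miso_bf file for one pairwise comparison.
bf_fname = get_bf_filename("comparisons/SE/sample1_vs_sample2")
if bf_fname is not None:
    print("Bayes factor file: %s" % (bf_fname))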
Example #34
 def load_pipeline_settings(self):
     """
     Load the settings filename
     """
     if not os.path.isfile(self.settings_filename):
         print "Error: %s is not a settings filename." % (self.settings_filename)
         sys.exit(1)
     self.settings = settings.load_settings(self.settings_filename)
     self.settings_info, self.parsed_settings = self.settings
     self.genome = self.settings_info["mapping"]["genome"]
     # Determine if we're in paired-end mode
     self.is_paired_end = False
     if self.settings_info["mapping"]["paired"]:
         self.is_paired_end = True
     # Load the sequence files
     self.load_sequence_files()
     # Load the directory where pipeline output should go
     self.output_dir = utils.pathify(self.settings_info["data"]["outdir"])
     print "Loaded pipeline settings (source: %s)." % (self.settings_filename)
     # Pipeline init directory
     self.init_dir = os.path.join(self.settings_info["pipeline-files"]["init_dir"])
     # Loading group information if there is any
     self.load_groups()
Example #35
 def load_sequence_files(self):
     """
     Load sequence files from settings file.
     """
     if self.settings_info is None:
         print "Error: cannot load sequence files if settings " "are not loaded."
         sys.exit(1)
     seq_files = self.settings_info["data"]["sequence_files"]
     # Get the absolute path names, with the prefix input directory,
     # for each sequence file
     sequence_filenames = []
     input_dir = utils.pathify(self.settings_info["data"]["indir"])
     for seq_entry in seq_files:
         if len(seq_entry) != 2:
             print "Error: Must provide a sequence filename and a " "sample label for each entry."
             sys.exit(1)
         fname, seq_label = seq_entry
         seq_fname = os.path.join(input_dir, fname)
         if not os.path.isfile(seq_fname):
             print "Error: Cannot find sequence file %s" % (seq_fname)
             sys.exit(1)
         sequence_filenames.append([seq_fname, seq_label])
     self.sequence_filenames = sequence_filenames
     return sequence_filenames
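A hedged sketch of the settings fragment load_sequence_files reads; the keys follow the code above, while file names and labels are placeholders:

settings_info = {
    "data": {
        "indir": "~/data/reads",
        "sequence_files": [
            ["sample1.fastq.gz", "sample1"],
            ["sample2.fastq.gz", "sample2"],
        ],
    }
}
# Each entry becomes [<indir>/<filename>, <label>] after the existence check.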
Example #36
def launchJob(cmd,
              job_name,
              scriptOptions,
              verbose=False,
              test=False,
              fast=False,
              queue_type="quick",
              ppn="4"):
    """
    Submits a job on the cluster which will run command 'cmd',
    with options 'scriptOptions'

    Optionally:
    verbose: output the job script
    test: don't actually submit the job script
          (usually used in conjunction with verbose)
    fast: submit only to the fast nodes on coyote

    Returns a job ID if the job was submitted properly
    """
    if type(cmd) not in [type(list()), type(tuple())]:
        cmd = [cmd]

    scriptOptions.setdefault("workingdir", os.getcwd())
    scriptOptions.setdefault("nodes", "1")
    scriptOptions.setdefault("ppn", str(ppn))
    scriptOptions.setdefault("jobname", job_name)
    scriptOptions.setdefault("scriptuser", getpass.getuser())
    scriptOptions.setdefault("queue", queue_type)
    scriptOptions.setdefault("outdir", "")

    scriptOptions["command"] = " ".join(cmd)

    pid = os.getpid()
    outscriptName = "%s.%i" % (scriptOptions["jobname"], pid)

    scriptOptions["outf"] = \
        utils.pathify(os.path.join(scriptOptions["outdir"],
                                   outscriptName+".out"))

    if fast:
        assert scriptOptions["nodes"] == "1", \
            "Can only choose specific nodes if you're " \
            "not restricting jobs to the fast nodes."
        scriptOptions["nodes"] = "1:E5450"

    outtext = """#!/bin/bash

    #PBS -l nodes=%(nodes)s:ppn=%(ppn)s
    #PBS -j oe
    #PBS -o %(outf)s

    #PBS -m a
    #PBS -M %(scriptuser)s@mit.edu
    #PBS -N %(jobname)s
    #PBS -q %(queue)s

    #PBS -S /bin/bash

    echo $HOSTNAME

    echo Working directory is %(workingdir)s
    cd %(workingdir)s

    echo "%(command)s"
    %(command)s
    echo "===== %(command)s finished =====" """ % scriptOptions

    if verbose:
        print outscriptName
        print outtext

    call = "qsub -"

    if not test:
        try:
            qsub = subprocess.Popen(call,
                                    shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    stdin=subprocess.PIPE)
            print "Executing: ", scriptOptions["command"]
            qsub.stdin.write(outtext)

            output = qsub.communicate()

            if output[0].strip().endswith(".coyote.mit.edu"):
                jobID = int(output[0].split(".")[0])

                if verbose:
                    print "Process launched with job ID:", jobID

                return jobID
            else:
                raise Exception("Failed to launch job '%s': %s" \
                                %(outscriptName,
                                  str(output)))
        except:
            print "failing..."
            raise
    return None
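A hypothetical call, most useful with test=True to inspect the generated PBS script without submitting it; the command, job name, and outdir are placeholders:

job_id = launchJob("samtools index sample1.bam",
                   "index_sample1",
                   {"outdir": "/home/user/logs"},
                   verbose=True,
                   test=True)
# With test=True nothing is submitted and launchJob returns None; otherwise
# the return value is the numeric job ID parsed from the qsub output.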
Example #37
def combine_comparisons(
    settings,
    logs_outdir,
    common_cols=["isoforms", "chrom", "strand", "mRNA_starts", "mRNA_ends", "gene_id", "gene_symbol"],
    delay=5,
    dry_run=False,
    NA_VAL="NA",
):
    """
    Output combined MISO comparisons. For each event type,
    combine the MISO comparisons for the relevant groups
    based on the 'comparison_groups' in the misowrap
    settings file.
    """
    settings_filename = utils.pathify(settings)
    logs_outdir = utils.pathify(logs_outdir)
    utils.make_dir(logs_outdir)
    misowrap_obj = mw.MISOWrap(settings_filename, logs_outdir, logger_label="combine_comparisons")
    comparisons_dir = misowrap_obj.comparisons_dir
    if not os.path.isdir(comparisons_dir):
        misowrap_obj.logger.critical("Comparisons directory %s not found. " % (comparisons_dir))
        sys.exit(1)
    # Comparison types to combine: unfiltered comparisons and filtered comparisons
    # (if available)
    unfiltered_comp_dir = os.path.join(comparisons_dir, "combined_comparisons")
    filtered_comp_dir = os.path.join(comparisons_dir, "filtered_events")
    dirs_to_process = [unfiltered_comp_dir, filtered_comp_dir]
    comparison_groups = misowrap_obj.comparison_groups
    for curr_comp_dir in dirs_to_process:
        if not os.path.isdir(curr_comp_dir):
            print "Comparisons directory %s not found, skipping" % (curr_comp_dir)
            continue
        # For each event type, output the sample comparisons
        for event_type in misowrap_obj.event_types:
            # Collection of MISO comparison dataframes (to be merged later)
            # for the current event type
            comparison_dfs = []
            comparison_labels = []
            event_dir = os.path.join(curr_comp_dir, event_type)
            if not os.path.isdir(event_dir):
                misowrap_obj.logger.info("Cannot find event type %s dir, " "skipping..." % (event_type))
                continue
            # Look only at sample comparisons within each sample group
            for comp_group in comparison_groups:
                sample_pairs = utils.get_pairwise_comparisons(comp_group)
                misowrap_obj.logger.info("  - Total of %d comparisons" % (len(sample_pairs)))
                for sample1, sample2 in sample_pairs:
                    # Load miso_bf file for the current comparison
                    # and join it to the combined df
                    comparison_name = "%s_vs_%s" % (sample1, sample2)
                    bf_data = miso_utils.load_miso_bf_file(event_dir, comparison_name, substitute_labels=True)
                    if bf_data is None:
                        misowrap_obj.logger.warning("Could not find comparison %s" % (comparison_name))
                        continue
                    comparison_dfs.append(bf_data)
                    comparison_labels.append(comparison_name)
            # Merge the comparison dfs together
            print "Merging comparisons for %s" % (event_type)
            combined_df = pandas_utils.combine_dfs(comparison_dfs)
            output_filename = os.path.join(curr_comp_dir, "%s.miso_bf" % (event_type))
            misowrap_obj.logger.info("Outputting %s results to: %s" % (event_type, output_filename))
            if not dry_run:
                combined_df.to_csv(output_filename, float_format="%.4f", sep="\t", na_rep=NA_VAL, index=True)
Example #38
 def load_settings(self):
     """
     Load settings for misowrap.
     """
     settings_info, parsed_settings = \
           misowrap_settings.load_misowrap_settings(self.settings_filename)
     self.settings_info = settings_info
     # Load basic settings about data
     self.read_len = self.settings_info["settings"]["readlen"]
     self.overhang_len = self.settings_info["settings"]["overhanglen"]
     self.miso_bin_dir = \
       utils.pathify(self.settings_info["settings"]["miso_bin_dir"])
     self.miso_settings_filename = \
       utils.pathify(self.settings_info["settings"]["miso_settings_filename"])
     self.miso_events_dir = \
       utils.pathify(self.settings_info["settings"]["miso_events_dir"])
     self.miso_outdir = \
       utils.pathify(self.settings_info["settings"]["miso_output_dir"])
     # Load data-related parameters
     self.bam_files = self.settings_info["data"]["bam_files"]
     if "insert_lens_dir" in self.settings_info["data"]:
         self.insert_lens_dir = \
           utils.pathify(self.settings_info["data"]["insert_lens_dir"])
     # Sample labels
     self.sample_labels = self.settings_info["data"]["sample_labels"]
     # Set output directories
     self.comparisons_dir = os.path.join(self.output_dir, "comparisons")
     self.comparison_groups = \
         self.settings_info["data"]["comparison_groups"]
     self.logs_outdir = os.path.join(self.output_dir, "misowrap_logs")
     # Create necessary directories
     utils.make_dir(self.logs_outdir)
     if "cluster_type" in self.settings_info["settings"]:
         self.use_cluster = True
         self.cluster_type = \
             self.settings_info["settings"]["cluster_type"]
         self.chunk_jobs = \
             self.settings_info["settings"]["chunk_jobs"]
     if self.use_cluster:
         print "Loading cluster information."
         # Load cluster object if given a cluster type
         self.load_cluster()
     # Create a logger object
     if self.logger_label is None:
         self.logger_label = "misowrap"
     else:
         self.logger_label = "misowrap_%s" % (logger_label)
     self.logger = utils.get_logger(self.logger_label, self.logs_outdir)
     # Whether to prefilter MISO events
     # Set general default settings
     if "prefilter_miso" not in settings_info["settings"]:
         # By default, set it so that MISO events are not
         # prefiltered
         settings_info["settings"]["prefilter_miso"] = False
     self.prefilter_miso = \
         self.settings_info["settings"]["prefilter_miso"]
     # Load event types
     self.load_event_types()
     # Set path to MISO scripts
     self.compare_miso_cmd = os.path.join(self.miso_bin_dir, "compare_miso")
     self.summarize_miso_cmd = os.path.join(self.miso_bin_dir,
                                            "summarize_miso")
     self.run_events_cmd = os.path.join(self.miso_bin_dir, "miso")
     self.pe_utils_cmd = os.path.join(self.miso_bin_dir, "pe_utils")
     # Files related to gene tables
     self.tables_dir = \
         os.path.join(self.settings_info["pipeline-files"]["init_dir"],
                      "ucsc")
     if not os.path.isdir(self.tables_dir):
         print "Error: %s directory does not exist." \
             %(self.tables_dir)
         sys.exit(1)
     self.const_exons_gff = os.path.join(self.tables_dir, "exons",
                                         "const_exons",
                                         "ensGene.const_exons.gff")
     if not os.path.isfile(self.const_exons_gff):
         print "Error: Const. exons GFF %s does not exist." \
             %(self.const_exons_gff)
         sys.exit(1)
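A hedged sketch of a misowrap settings structure consistent with the keys read above; all values are placeholders, and the real file is parsed by misowrap_settings.load_misowrap_settings:

settings_info = {
    "settings": {
        "readlen": 40,
        "overhanglen": 4,
        "miso_bin_dir": "~/miso/bin",
        "miso_settings_filename": "~/miso_settings.txt",
        "miso_events_dir": "~/annotations/miso_events",
        "miso_output_dir": "~/miso_output",
        "cluster_type": "bsub",   # optional; enables cluster submission
        "chunk_jobs": 600,
    },
    "data": {
        "bam_files": [["sample1.bam", "sample1"], ["sample2.bam", "sample2"]],
        "insert_lens_dir": "~/insert_lens",   # optional, paired-end only
        "sample_labels": [["sample1", "Sample 1"], ["sample2", "Sample 2"]],
        "comparison_groups": [["sample1", "sample2"]],
    },
    "pipeline-files": {"init_dir": "~/pipeline_init"},
}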
Example #39
def run(
    settings,
    logs_outdir,
    use_cluster=True,
    base_delay=10,
    # Batch delay (20 mins by default)
    batch_delay=60 * 20,
    delay_every_n_jobs=30,
    dry_run=False,
    event_types=None,
):
    """
    Run MISO on a set of samples.
    """
    settings_filename = utils.pathify(settings)
    if event_types is not None:
        print "Only running MISO on event types: ", event_types
    misowrap_obj = mw.MISOWrap(settings_filename, logs_outdir, logger_label="run")
    output_dir = misowrap_obj.miso_outdir
    bam_files = misowrap_obj.bam_files
    read_len = misowrap_obj.read_len
    overhang_len = misowrap_obj.overhang_len
    events_dir = misowrap_obj.miso_events_dir
    single_end = True
    if misowrap_obj.insert_lens_dir is not None:
        insert_lens_dir = misowrap_obj.insert_lens_dir
        misowrap_obj.logger.info("Running in paired-end mode...")
        misowrap_obj.logger.info(" - Insert length directory: %s" % (insert_lens_dir))
        single_end = False
    else:
        misowrap_obj.logger.info("Running in single-end mode...")
    run_events_analysis = misowrap_obj.run_events_cmd
    event_types_dirs = miso_utils.get_event_types_dirs(misowrap_obj.settings_info)
    miso_settings_filename = misowrap_obj.miso_settings_filename
    n = 0
    for bam_input in bam_files:
        bam_filename, sample_label = bam_input
        bam_filename = utils.pathify(bam_filename)
        misowrap_obj.logger.info("Processing: %s" % (bam_filename))
        for event_type_dir in event_types_dirs:
            event_type = os.path.basename(event_type_dir)
            if event_types is not None:
                if event_type not in event_types:
                    print "Skipping event type: %s" % (event_type)
                    continue
            print "  - Using event dir: %s" % (event_type_dir)
            miso_cmd = "%s" % (run_events_analysis)
            bam_basename = os.path.basename(bam_filename)
            # Output directory for sample
            sample_output_dir = os.path.join(output_dir, sample_label, event_type)
            # Pass sample to MISO along with event
            miso_cmd += " --run %s %s" % (event_type_dir, bam_filename)
            if not single_end:
                insert_len_filename = os.path.join(insert_lens_dir, "%s.insert_len" % (bam_basename))
                misowrap_obj.logger.info("Reading paired-end parameters " "from file...")
                misowrap_obj.logger.info("  - PE file: %s" % (insert_len_filename))
                pe_params = miso_utils.read_pe_params(insert_len_filename)
                # Paired-end parameters
                miso_cmd += " --paired-end %.2f %.2f" % (pe_params["mean"], pe_params["sdev"])
            # Read length
            miso_cmd += " --read-len %d" % (read_len)
            # Overhang length
            miso_cmd += " --overhang-len %d" % (overhang_len)
            # Prefilter?
            if misowrap_obj.prefilter_miso:
                miso_cmd += " --prefilter"
            # Output directory
            miso_cmd += " --output-dir %s" % (sample_output_dir)
            # Use cluster
            if misowrap_obj.use_cluster:
                miso_cmd += " --use-cluster"
                miso_cmd += " --chunk-jobs %d" % (misowrap_obj.chunk_jobs)
            # Settings
            miso_cmd += " --settings %s" % (miso_settings_filename)
            misowrap_obj.logger.info("Executing: %s" % (miso_cmd))
            job_name = "%s_%s" % (sample_label, event_type)
            if use_cluster:
                if not dry_run:
                    misowrap_obj.my_cluster.launch_job(miso_cmd, job_name, ppn=1)
                if n == delay_every_n_jobs:
                    # Larger delay every time we've submitted n jobs
                    misowrap_obj.logger.info("Submitted %d jobs, now waiting %.2f mins." % (n, batch_delay / 60.0))
                    time.sleep(batch_delay)
                    n = 0
                time.sleep(base_delay)
            else:
                if not dry_run:
                    os.system(miso_cmd)
            n += 1
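# Illustrative sketch, not part of the original example: the kind of command
# string the loop above assembles for a single-end sample. All paths and
# numbers below are made-up placeholders, and run_events_cmd stands in for
# misowrap_obj.run_events_cmd.
run_events_cmd = "/software/miso/bin/miso"
example_cmd = "%s --run %s %s" % (run_events_cmd,
                                  "/data/miso_events/SE",     # event annotation dir
                                  "/data/bams/sampleA.bam")   # sample BAM
example_cmd += " --read-len %d" % (40)
example_cmd += " --overhang-len %d" % (8)
example_cmd += " --output-dir %s" % ("/data/miso_output/sampleA/SE")
example_cmd += " --settings %s" % ("/data/miso_settings.txt")
print example_cmd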
Ejemplo n.º 40
0
def main():
    from optparse import OptionParser
    parser = OptionParser()

    parser.add_option(
        "--input-gff",
        dest="input_gff",
        default=None,
        nargs=1,
        help="Fetch sequence from GFF events file. Takes as input: "
        "GFF filename.")
    parser.add_option("--fi",
                      dest="fasta_fname",
                      default=None,
                      nargs=1,
                      help="FASTA filename to fetch sequences from.")
    parser.add_option(
        "--with-flanking-introns",
        dest="with_flanking_introns",
        default=False,
        action="store_true",
        help="Get sequence of flanking introns relative to skipped exon.")
    parser.add_option(
        "--flanking-introns-coords",
        dest="flanking_introns_coords",
        default=None,
        nargs=4,
        help="Fetch the sequences of the flanking introns "
        "(for SpliceGraph events). Takes as input the intervals to "
        "be used, which are: "
        "(1) start position relative to 5 prime splice site of SE "
        "(negative int), "
        "(2) end position 5 prime splice site (negative int), "
        "(3) start position relative to 3 prime splice site "
        "(positive int), "
        "(4) end position relative to 3 prime splice site. "
        "(posiitve int). "
        "Suggested settings are -250, -20, 20, -250.")
    parser.add_option("--output-dir",
                      dest="output_dir",
                      nargs=1,
                      default=None,
                      help="Output directory.")
    (options, args) = parser.parse_args()

    if options.output_dir is None:
        greeting()
        print "Error: need --output-dir to be provided."
        sys.exit(1)

    output_dir = options.output_dir
    output_dir = utils.pathify(options.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if options.input_gff is not None:
        if options.fasta_fname is None:
            greeting()
            print "Error: Must provide input fasta file with --fi."
            sys.exit(1)
        # Check for FASTA
        gff_filename = utils.pathify(options.input_gff)
        fasta_fname = utils.pathify(options.fasta_fname)
        flanking_introns_coords = options.flanking_introns_coords
        gffutils_helpers.fetch_seq_from_gff(
            gff_filename,
            fasta_fname,
            output_dir,
            with_flanking_introns=options.with_flanking_introns,
            flanking_introns_coords=options.flanking_introns_coords)
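# Illustrative usage, not part of the original example: how this entry point
# is typically invoked from the shell. The script name is a placeholder; the
# flags are the ones defined by the OptionParser above.
#
#   python fetch_gff_seqs.py \
#       --input-gff SE.mm9.gff \
#       --fi mm9.fa \
#       --with-flanking-introns \
#       --output-dir seqs_output/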
Ejemplo n.º 41
0
def combine_comparisons(settings,
                        logs_outdir,
                        common_cols=["isoforms",
                                     "chrom",
                                     "strand",
                                     "mRNA_starts",
                                     "mRNA_ends",
                                     "gene_id",
                                     "gene_symbol"],
                        delay=5,
                        dry_run=False,
                        NA_VAL="NA"):
    """
    Output combined MISO comparisons. For each event type,
    combine the MISO comparisons for the relevant groups
    based on the 'comparison_groups' in the misowrap
    settings file.
    """
    settings_filename = utils.pathify(settings)
    logs_outdir = utils.pathify(logs_outdir)
    utils.make_dir(logs_outdir)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="combine_comparisons")
    comparisons_dir = misowrap_obj.comparisons_dir    
    if not os.path.isdir(comparisons_dir):
        misowrap_obj.logger.critical("Comparisons directory %s not found. " \
                                     %(comparisons_dir))
        sys.exit(1)
    # Comparison types to combine: unfiltered comparisons and filtered comparisons
    # (if available)
    unfiltered_comp_dir = os.path.join(comparisons_dir,
                                       "combined_comparisons")
    filtered_comp_dir = os.path.join(comparisons_dir,
                                     "filtered_events")
    dirs_to_process = [unfiltered_comp_dir, filtered_comp_dir]
    comparison_groups = misowrap_obj.comparison_groups
    for curr_comp_dir in dirs_to_process:
        if not os.path.isdir(curr_comp_dir):
            print "Comparisons directory %s not found, skipping" %(curr_comp_dir)
            continue
        # For each event type, output the sample comparisons
        for event_type in misowrap_obj.event_types:
            # Collection of MISO comparison dataframes (to be merged later)
            # for the current event type
            comparison_dfs = []
            comparison_labels = []
            event_dir = os.path.join(curr_comp_dir, event_type)
            if not os.path.isdir(event_dir):
                misowrap_obj.logger.info("Cannot find event type %s dir, " \
                                         "skipping..." %(event_type))
                continue
            # Look only at sample comparisons within each sample group
            for comp_group in comparison_groups:
                sample_pairs = utils.get_pairwise_comparisons(comp_group)
                misowrap_obj.logger.info("  - Total of %d comparisons" \
                                         %(len(sample_pairs)))
                for sample1, sample2 in sample_pairs:
                    # Load miso_bf file for the current comparison
                    # and join it to the combined df
                    comparison_name = "%s_vs_%s" %(sample1, sample2)
                    bf_data = miso_utils.load_miso_bf_file(event_dir,
                                                           comparison_name,
                                                           substitute_labels=True)
                    if bf_data is None:
                        misowrap_obj.logger.warning("Could not find comparison %s" \
                                                    %(comparison_name))
                        continue
                    comparison_dfs.append(bf_data)
                    comparison_labels.append(comparison_name)
            # Merge the comparison dfs together
            print "Merging comparisons for %s" %(event_type)
            combined_df = pandas_utils.combine_dfs(comparison_dfs)
            # Write the combined table into the current comparisons directory
            output_filename = os.path.join(curr_comp_dir,
                                           "%s.miso_bf" %(event_type))
            misowrap_obj.logger.info("Outputting %s results to: %s" \
                                     %(event_type, output_filename))
            if not dry_run:
                combined_df.to_csv(output_filename,
                                   float_format="%.4f",
                                   sep="\t",
                                   na_rep=NA_VAL,
                                   index=True)
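# Illustrative sketch, an assumption rather than the original helper: the
# pairwise expansion that utils.get_pairwise_comparisons is expected to
# perform on a comparison group, approximated here with unordered pairs from
# itertools. Sample labels are placeholders.
import itertools

comp_group = ["knockdown", "control", "rescue"]
sample_pairs = list(itertools.combinations(comp_group, 2))
# -> [('knockdown', 'control'), ('knockdown', 'rescue'), ('control', 'rescue')]
print sample_pairs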
Ejemplo n.º 42
0
def annotate_gff_with_genes(args):
    """
    Annotate GFF with genes table.
    """
    gff_fname = utils.pathify(args.gff_filename)
    if not os.path.isfile(gff_fname):
        raise Exception, "Cannot find %s" % (gff_fname)
    table_fname = utils.pathify(args.table_filename)
    if not os.path.isfile(table_fname):
        raise Exception, "Cannot find %s" % (table_fname)
    table_bed = get_table_as_bedtool(table_fname)
    # Get BedTool for events, containing only the gene entries
    all_events_bed = pybedtools.BedTool(gff_fname)
    event_genes = \
        all_events_bed.filter(lambda entry: entry.fields[2] == "gene")
    print "Determining overlap between events and genes..."
    # Intersect event genes with gene txStart/txEnd
    intersected_bed = \
        event_genes.intersect(table_bed, wb=True, s=True, f=1)
    # Map event genes to their IDs
    #
    #  event_gene1 -> refseq  -> value
    #              -> ensgene -> value
    #  event_gene2 -> refseq  ->
    #  ...
    event_genes_to_info = \
        defaultdict(lambda: defaultdict(list))
    for entry in intersected_bed:
        event_gene_attrs = utils.parse_attributes(entry.fields[8])
        event_gene_str = event_gene_attrs["ID"]
        gene_info_field = entry.fields[-1]
        # Strip semicolon of ID attributes
        if gene_info_field.endswith(";"):
            gene_info_field = gene_info_field[0:-1]
        # Convert attributes into dictionary
        gene_info = utils.parse_attributes(gene_info_field)
        ensgene_id = gene_info["ensg_id"]
        refseq_id = gene_info["refseq_id"]
        gene_symbol = gene_info["gsymbol"]
        # Skip null entries
        if not is_null_id(ensgene_id):
            event_genes_to_info[event_gene_str]["ensg_id"].append(ensgene_id)
        if not is_null_id(refseq_id):
            event_genes_to_info[event_gene_str]["refseq_id"].append(refseq_id)
        if not is_null_id(gene_symbol):
            event_genes_to_info[event_gene_str]["gsymbol"].append(gene_symbol)
    # Incorporate the gene information into the GFF and output it
    # using gffutils
    print "Loading events into GFF database..."
    events_db = gffutils.create_db(gff_fname, ":memory:", verbose=False)
    output_fname = gff_fname
    events_out = gffwriter.GFFWriter(output_fname, in_place=True)
    print " - Outputting annotated GFF to: %s" % (output_fname)

    def new_recs():
        for gene_recs in list(events_db.iter_by_parent_childs()):
            gene_rec = gene_recs[0]
            event_id = gene_rec.id
            # Use existing IDs if present
            if "ensgene_id" in gene_rec.attributes:
                ensgene_id = gene_rec.attributes["ensg_id"][0]
            else:
                ensgene_id = "NA"
            if "refseq_id" in gene_rec.attributes:
                refseq_id = gene_rec.attributes["refseq_id"][0]
            else:
                refseq_id = "NA"
            if "gene_symbol" in gene_rec.attributes:
                gene_symbol = gene_rec.attributes["gsymbol"][0]
            else:
                gene_symbol = "NA"
            if event_id in event_genes_to_info:
                event_info = event_genes_to_info[event_id]
                ensgene_ids = \
                    utils.unique_list(event_info["ensg_id"])
                if len(ensgene_ids) > 0 and ensgene_ids[0] != "NA":
                    ensgene_id = ",".join(ensgene_ids)
                refseq_ids = \
                    utils.unique_list(event_info["refseq_id"])
                if len(refseq_ids) > 0 and refseq_ids[0] != "NA":
                    refseq_id = ",".join(refseq_ids)
                gene_symbols = \
                    utils.unique_list(event_info["gsymbol"])
                if len(gene_symbols) > 0 and gene_symbols[0] != "NA":
                    gene_symbol = ",".join(gene_symbols)
            gene_rec.attributes["ensg_id"] = [ensgene_id]
            gene_rec.attributes["refseq_id"] = [refseq_id]
            gene_rec.attributes["gsymbol"] = [gene_symbol]
            # Yield all the gene's records
            for g in gene_recs:
                yield g

    t1 = time.time()
    print "Creating annotated GFF database..."
    annotated_db = gffutils.create_db(new_recs(), ":memory:", verbose=False)
    t2 = time.time()
    print "Creation took %.2f secs" % (t2 - t1)
    # Write to file
    print "Writing annotated GFF to file..."
    for gene_rec in annotated_db.all_features(featuretype="gene"):
        events_out.write_gene_recs(annotated_db, gene_rec.id)
    events_out.close()
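# Illustrative sketch, an assumption rather than the original helper: the kind
# of GFF attribute parsing utils.parse_attributes is expected to do, i.e. turn
# a "key=value;key=value" field into a dictionary. Attribute values below are
# placeholders.
def parse_gff_attributes(field):
    """Toy stand-in for utils.parse_attributes."""
    attrs = {}
    for pair in field.rstrip(";").split(";"):
        key, _, value = pair.partition("=")
        attrs[key.strip()] = value.strip()
    return attrs

example_field = "ID=chr1:100:200:+;ensg_id=ENSMUSG00000000001;gsymbol=GeneA"
print parse_gff_attributes(example_field)
# -> {'ID': 'chr1:100:200:+', 'ensg_id': 'ENSMUSG00000000001', 'gsymbol': 'GeneA'}
# (key order may differ)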
Ejemplo n.º 43
0
 def load_settings(self):
     """
     Load settings for misowrap.
     """
     settings_info, parsed_settings = \
             misowrap_settings.load_misowrap_settings(self.settings_filename)
     self.settings_info = settings_info
     # Load basic settings about data
     self.read_len = self.settings_info["settings"]["readlen"]
     self.overhang_len = self.settings_info["settings"]["overhanglen"]
     self.miso_bin_dir = \
       utils.pathify(self.settings_info["settings"]["miso_bin_dir"])
     self.miso_settings_filename = \
       utils.pathify(self.settings_info["settings"]["miso_settings_filename"])
     self.miso_events_dir = \
       utils.pathify(self.settings_info["settings"]["miso_events_dir"])
     self.miso_outdir = \
       utils.pathify(self.settings_info["settings"]["miso_output_dir"])
     # Load data-related parameters
     self.bam_files = self.settings_info["data"]["bam_files"]
     if "insert_lens_dir" in self.settings_info["data"]:
         self.insert_lens_dir = \
           utils.pathify(self.settings_info["data"]["insert_lens_dir"])
     # Sample labels
     self.sample_labels = self.settings_info["data"]["sample_labels"]
     # Set output directories
     self.comparisons_dir = os.path.join(self.output_dir, 
                                         "comparisons")
     self.comparison_groups = \
         self.settings_info["data"]["comparison_groups"]
     self.logs_outdir = os.path.join(self.output_dir,
                                     "misowrap_logs")
     # Create necessary directories
     utils.make_dir(self.miso_outdir)
     utils.make_dir(self.comparisons_dir)
     utils.make_dir(self.logs_outdir)
     if "cluster_type" in self.settings_info["settings"]:
         self.use_cluster = True
         self.cluster_type = \
             self.settings_info["settings"]["cluster_type"]
         self.chunk_jobs = \
             self.settings_info["settings"]["chunk_jobs"]
     if self.use_cluster:
         print "Loading cluster information."
         # Load cluster object if given a cluster type
         self.load_cluster()
     # Create a logger object
     if self.logger_label is None:
         self.logger_label = "misowrap"
     else:
         self.logger_label = "misowrap_%s" %(logger_label)
     self.logger = utils.get_logger(self.logger_label,
                                    self.logs_outdir)
     # Whether to prefilter MISO events
     # Set general default settings
     if "prefilter_miso" not in settings_info["settings"]:
         # By default, set it so that MISO events are not
         # prefiltered
         settings_info["settings"]["prefilter_miso"] = False
     self.prefilter_miso = \
         self.settings_info["settings"]["prefilter_miso"]
     # Load event types
     self.load_event_types()
     # Set path to MISO scripts
     self.run_miso_cmd = os.path.join(self.miso_bin_dir,
                                      "run_miso.py")
     self.run_events_cmd = os.path.join(self.miso_bin_dir,
                                        "run_events_analysis.py")
     self.pe_utils_cmd = os.path.join(self.miso_bin_dir,
                                      "pe_utils.py")
     # Files related to gene tables
     self.tables_dir = \
         os.path.join(self.settings_info["pipeline-files"]["init_dir"],
                      "ucsc")
     if not os.path.isdir(self.tables_dir):
         print "Error: %s directory does not exist." \
             %(self.tables_dir)
         sys.exit(1)
     self.const_exons_gff = os.path.join(self.tables_dir,
                                         "exons",
                                         "const_exons",
                                         "ensGene.const_exons.gff")
     if not os.path.isfile(self.const_exons_gff):
         print "Error: Const. exons GFF %s does not exist." \
             %(self.const_exons_gff)
         sys.exit(1)
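# Illustrative sketch, not part of the original example: the shape of the
# settings_info dictionary that load_settings() reads, inferred only from the
# keys accessed above. All values are placeholders; real misowrap settings
# files will differ.
settings_info = {
    "settings": {
        "readlen": 40,
        "overhanglen": 8,
        "miso_bin_dir": "~/miso/bin",
        "miso_settings_filename": "~/miso_settings.txt",
        "miso_events_dir": "~/miso_events",
        "miso_output_dir": "~/miso_output",
        "cluster_type": "pbs",       # optional; turns on cluster submission
        "chunk_jobs": 200,           # read when cluster_type is given
        "prefilter_miso": False,     # optional; defaults to False
    },
    "data": {
        # (BAM filename, sample label) pairs, as unpacked by run()
        "bam_files": [("~/bams/sampleA.bam", "sampleA"),
                      ("~/bams/sampleB.bam", "sampleB")],
        "insert_lens_dir": "~/insert_lens",   # optional; paired-end runs only
        "sample_labels": ["sampleA", "sampleB"],
        "comparison_groups": [["sampleA", "sampleB"]],
    },
    "pipeline-files": {
        "init_dir": "~/pipeline_init",
    },
}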
Ejemplo n.º 44
0
def download_genome_seq(genome,
                        output_dir):
    """
    Download genome sequence files from UCSC.
    """
    print "Downloading genome sequence files for %s" %(genome)
    print "  - Output dir: %s" %(output_dir)
    output_dir = utils.pathify(os.path.join(output_dir, "genome"))
    utils.make_dir(output_dir)
    dir_files = os.listdir(output_dir)
    # Change to output directory
    os.chdir(output_dir)
    ##
    ## Download the genome sequence files
    ##
    genome_url = "%s/%s/chromosomes/" %(UCSC_GOLDENPATH_FTP,
                                        genome)
    # Fetch all chromosome sequence files
    if len(dir_files) >= 1:
        print "Directory %s exists and contains files; " \
              "skipping download of genome..." \
              %(output_dir)
    else:
        download_utils.wget(os.path.join(genome_url, "*"))
        # Remove random chromosome contigs
        for fname in glob.glob(os.path.join(output_dir, "*.fa.gz")):
            if "_" in os.path.basename(fname):
                print "Deleting: %s" %(fname)
                os.remove(fname)
        ##
        ## Uncompress the files
        ##
        print "Uncompressing files..."
        uncompress_cmd = "gunzip %s/*.gz" %(output_dir)
        print "  - Uncompress cmd: %s" %(uncompress_cmd)
        t1 = time.time()
        ret_val = os.system(uncompress_cmd)
        if ret_val != 0:
            print "Error: Cannot uncompress files in %s" %(output_dir)
            sys.exit(1)
        t2 = time.time()
        print "Uncompressing took %.2f minutes" %((t2 - t1)/60.)
    # Create a single genome FASTA file by concatenating the
    # chromosomes together
    genome_output_fname = \
        os.path.join(output_dir, "%s.fa" %(genome))
    if not os.path.isfile(genome_output_fname):
        print "Concatenating genome chromosomes into one file..."
        print "  - Output file: %s" %(genome_output_fname)
        t1 = time.time()
        concat_chrom_cmd = "cat %s/*.fa > %s" %(output_dir,
                                                genome_output_fname)
        print "  - Concat cmd: %s" %(concat_chrom_cmd)
        ret_val = os.system(concat_chrom_cmd)
        if ret_val != 0:
            print "Error: Could not concatenate genome chromosomes."
            sys.exit(1)
        # Create an index for resulting genome file
        print "Indexing genome file..."
        samtools_index_cmd = "samtools faidx %s" %(genome_output_fname)
        print "  - Index cmd: %s" %(samtools_index_cmd)
        ret_val = os.system(samtools_index_cmd)
        if ret_val != 0:
            print "Error: Could not index genome file."
            sys.exit(1)
        t2 = time.time()
        print "Concatenation and indexing took %.2f minutes" \
            %((t2 - t1)/60.)
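# Illustrative usage, not part of the original example: downloading and
# indexing the mm9 genome into a pipeline directory (path is a placeholder;
# samtools must be on the PATH for the indexing step).
#
#   download_genome_seq("mm9", "/data/pipeline_init/mm9")
#
# Afterwards, <output_dir>/genome/ holds the per-chromosome *.fa files, the
# concatenated mm9.fa, and its samtools .fai index.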
Ejemplo n.º 45
0
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--run",
                      dest="run",
                      action="store_true",
                      default=False,
                      help="Run pipeline.")
    parser.add_option(
        "--run-on-sample",
        dest="run_on_sample",
        nargs=1,
        default=None,
        help="Run on a particular sample. Takes as input the sample label.")
    parser.add_option("--settings",
                      dest="settings",
                      nargs=1,
                      default=None,
                      help="Settings filename.")
    parser.add_option("--init",
                      dest="initialize",
                      nargs=1,
                      default=None,
                      help="Initialize the pipeline. Takes as input a genome, "
                      "e.g. mm9 or hg18")
    parser.add_option("--output-dir",
                      dest="output_dir",
                      nargs=1,
                      default=None,
                      help="Output directory.")
    ##
    ## Options related to --init
    ##
    parser.add_option("--frac-constitutive", dest="frac_constitutive",
                      nargs=1, default=0.7, type="float",
                      help="Fraction (number between 0 and 1) of " \
                      "transcripts that an exon can be in to be considered " \
                      "constitutive. Default is 0.7 (i.e. 70% of " \
                      "transcripts.)")
    parser.add_option("--constitutive-exon-diff", dest="constitutive_exon_diff",
                      nargs=1, default=10, type="int",
                      help="Number of \'wiggle\' bases by which an exon can " \
                      "differ in order to be considered constitutive. By " \
                      "default set to 10.")
    (options, args) = parser.parse_args()

    greeting()

    if options.output_dir is None:
        print "Error: need --output-dir argument."
        parser.print_help()
        sys.exit(1)

    output_dir = utils.pathify(options.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    settings_filename = None
    if options.run:
        if options.settings is None:
            # Running of pipeline requires settings filename
            print "Error: need --settings"
            parser.print_help()
            sys.exit(1)
        settings_filename = utils.pathify(options.settings)
        run_pipeline(settings_filename, output_dir)

    if options.run_on_sample is not None:
        if options.settings is None:
            # Running of pipeline requires settings filename
            print "Error: need --settings"
            parser.print_help()
            sys.exit(1)
        settings_filename = utils.pathify(options.settings)
        sample_label = options.run_on_sample
        run_on_sample(sample_label, settings_filename, output_dir)

    if options.initialize is not None:
        # Parse initialization-related settings
        frac_constitutive = float(options.frac_constitutive)
        constitutive_exon_diff = int(options.constitutive_exon_diff)
        init_params = {
            "frac_constitutive": frac_constitutive,
            "constitutive_exon_diff": constitutive_exon_diff
        }
        genome = options.initialize
        initialize_pipeline(genome, output_dir, init_params=init_params)
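# Illustrative usage, not part of the original example: typical invocations of
# this entry point. The script name is a placeholder; the flags are the ones
# defined above.
#
#   # Initialize pipeline files for mm9 (default --frac-constitutive 0.7,
#   # --constitutive-exon-diff 10)
#   python pipeline.py --init mm9 --output-dir ~/pipeline_out
#
#   # Run the pipeline with a settings file
#   python pipeline.py --run --settings settings.txt --output-dir ~/pipeline_out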
Ejemplo n.º 46
0
def filter_comparisons(fname, output_dir,
                       event_type=None,
                       atleast_inc=None,
                       atleast_exc=None,
                       atleast_sum=None,
                       gene_table=None,
                       gene_id_cols=["ensg_id", "gsymbol"],
                       dry_run=False):
    """
    Filter a MISO comparisons file (*.miso_bf) by read-count thresholds
    appropriate for the given event type. Optionally annotate the
    filtered events with gene IDs taken from an input gene table.
    """
    fname = utils.pathify(fname)
    output_dir = utils.pathify(output_dir)
    print "Filtering MISO comparisons file..."
    print "  - MISO comparisons: %s" %(fname)
    print "  - Event type: %s" %(event_type)
    if event_type is not None:
        output_dir = os.path.join(output_dir, event_type)
    utils.make_dir(output_dir)
    print " - Output dir: %s" %(output_dir)
    if "UTR" in event_type:
        def_atleast_inc = tandemutr_atleast_inc
        def_atleast_exc = tandemutr_atleast_exc
        def_atleast_sum = tandemutr_atleast_sum
    elif "SE" in event_type:
        def_atleast_inc = se_atleast_inc
        def_atleast_exc = se_atleast_exc
        def_atleast_sum = se_atleast_sum
    elif "AFE" in event_type:
        def_atleast_inc = afe_atleast_inc
        def_atleast_exc = afe_atleast_exc
        def_atleast_sum = afe_atleast_sum
    elif "ALE" in event_type:
        def_atleast_inc = ale_atleast_inc
        def_atleast_exc = ale_atleast_exc
        def_atleast_sum = ale_atleast_sum
    elif "RI" in event_type:
        def_atleast_inc = ri_atleast_inc
        def_atleast_exc = ri_atleast_exc
        def_atleast_sum = ri_atleast_sum
    else:
        def_atleast_inc = 0
        def_atleast_exc = 0
        def_atleast_sum = 0
    # If read count filters are not given, use the default
    if atleast_inc is None:
        atleast_inc = def_atleast_inc
    if atleast_exc is None:
        atleast_exc = def_atleast_exc
    if atleast_sum is None:
        atleast_sum = def_atleast_sum
    # Filter the events file
    if not os.path.isfile(fname):
        print "Error: Cannot find MISO comparisons file %s" %(fname)
        sys.exit(1)
    if not fname.endswith(".miso_bf"):
        print "Warning: MISO comparisons file %s does not end in " \
              ".miso_bf.  Are you sure it is a comparisons file?" \
              %(fname)
    # Filter comparisons
    # ...
    filtered_df = None
            comparison_counts = \
                self.load_comparisons_counts_from_df(comparisons_df[event_type])
            # Get counts for each read class for sample 1 and sample 2
            comparison_counts = \
                miso_utils.get_counts_by_class("sample1_counts_int",
                                               "sample1",
                                               comparison_counts)
            comparison_counts = \
                miso_utils.get_counts_by_class("sample2_counts_int",
                                               "sample2",
                                               comparison_counts)
            filtered_df = comparison_counts
            # Default constitutive read threshold (no constitutive
            # filtering unless overridden below)
            atleast_const = 0
            # Relax the exclusion read filter for TandemUTR events only
            if "TandemUTR" in event_type:
                atleast_exc = 0
                atleast_const = 5
            # Filter inclusion reads: keep events where either sample
            # has enough inclusion-supporting reads
            filtered_df = \
                filtered_df[(filtered_df["sample1_inc_counts"] >= atleast_inc) \
                            | (filtered_df["sample2_inc_counts"] >= atleast_inc)]
            # Filter exclusion reads
            filtered_df = \
                filtered_df[(filtered_df["sample1_exc_counts"] >= atleast_exc) \
                            | (filtered_df["sample2_exc_counts"] >= atleast_exc)]
            # Filter the sum of inclusion and exclusion reads
            sample1_sum = \
                filtered_df["sample1_inc_counts"] + \
                filtered_df["sample1_exc_counts"]
            sample2_sum = \
                filtered_df["sample2_inc_counts"] + \
                filtered_df["sample2_exc_counts"]
            filtered_df = \
                filtered_df[(sample1_sum >= atleast_sum) \
                            | (sample2_sum >= atleast_sum)]
            # Filter constitutive reads
            filtered_df = \
                filtered_df[(filtered_df["sample1_const_counts"] >= atleast_const) \
                            | (filtered_df["sample2_const_counts"] >= atleast_const)]
            self.filtered_events[event_type] = filtered_df
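# Illustrative sketch, not part of the original example: the read-count filter
# used above, shown on a toy pandas DataFrame. An event passes when either
# sample has at least `atleast_inc` inclusion reads; the exclusion, summed,
# and constitutive filters follow the same pattern.
import pandas

atleast_inc = 10
toy_df = pandas.DataFrame({"sample1_inc_counts": [25, 3, 0],
                           "sample2_inc_counts": [1, 2, 40]})
kept = toy_df[(toy_df["sample1_inc_counts"] >= atleast_inc) |
              (toy_df["sample2_inc_counts"] >= atleast_inc)]
print kept   # rows 0 and 2 pass the inclusion-read filter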
Ejemplo n.º 47
0
def run(settings, logs_outdir,
        use_cluster=True,
        base_delay=10,
        # Batch delay (20 mins by default)
        batch_delay=60*20,
        delay_every_n_jobs=30,
        dry_run=False,
        event_types=None):
    """
    Run MISO on a set of samples.
    """
    settings_filename = utils.pathify(settings)
    if event_types is not None:
        print "Only running MISO on event types: ", event_types
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="run")
    output_dir = misowrap_obj.miso_outdir
    bam_files = misowrap_obj.bam_files
    read_len = misowrap_obj.read_len
    overhang_len = misowrap_obj.overhang_len
    events_dir = misowrap_obj.miso_events_dir
    single_end = True
    if misowrap_obj.insert_lens_dir is not None:
        insert_lens_dir = misowrap_obj.insert_lens_dir
        misowrap_obj.logger.info("Running in paired-end mode...")
        misowrap_obj.logger.info(" - Insert length directory: %s" \
                                 %(insert_lens_dir))
        single_end = False
    else:
        misowrap_obj.logger.info("Running in single-end mode...")        
    run_events_analysis = misowrap_obj.run_events_cmd
    event_types_dirs = \
        miso_utils.get_event_types_dirs(misowrap_obj.settings_info)
    miso_settings_filename = misowrap_obj.miso_settings_filename
    n = 0
    for bam_input in bam_files:
        bam_filename, sample_label = bam_input
        bam_filename = utils.pathify(bam_filename)
        misowrap_obj.logger.info("Processing: %s" %(bam_filename))
        for event_type_dir in event_types_dirs:
            event_type = os.path.basename(event_type_dir)
            if event_types is not None:
                if event_type not in event_types:
                    print "Skipping event type: %s" %(event_type)
                    continue
            print "  - Using event dir: %s" %(event_type_dir)
            miso_cmd = "%s" %(run_events_analysis)
            bam_basename = os.path.basename(bam_filename)
            # Output directory for sample
            sample_output_dir = os.path.join(output_dir, 
                                             sample_label,
                                             event_type)
            # Pass sample to MISO along with event
            miso_cmd += " --run %s %s" %(event_type_dir,
                                         bam_filename)
            if not single_end:
                insert_len_filename = \
                    os.path.join(insert_lens_dir,
                                 "%s.insert_len" %(bam_basename))
                misowrap_obj.logger.info("Reading paired-end parameters " \
                                         "from file...")
                misowrap_obj.logger.info("  - PE file: %s" \
                                         %(insert_len_filename))
                pe_params = miso_utils.read_pe_params(insert_len_filename)
                # Paired-end parameters
                miso_cmd += " --paired-end %.2f %.2f" %(pe_params["mean"],
                                                        pe_params["sdev"])
            # Read length
            miso_cmd += " --read-len %d" %(read_len)
            # Overhang length
            miso_cmd += " --overhang-len %d" %(overhang_len)
            # Prefilter?
            if misowrap_obj.prefilter_miso:
                miso_cmd += " --prefilter"
            # Output directory
            miso_cmd += " --output-dir %s" %(sample_output_dir)
            # Use cluster
            if misowrap_obj.use_cluster:
                miso_cmd += " --use-cluster"
                miso_cmd += " --chunk-jobs %d" %(misowrap_obj.chunk_jobs)
            # Settings
            miso_cmd += " --settings %s" %(miso_settings_filename)
            misowrap_obj.logger.info("Executing: %s" %(miso_cmd))
            job_name = "%s_%s" %(sample_label, event_type)
            if use_cluster:
                if not dry_run:
                    misowrap_obj.my_cluster.launch_job(miso_cmd,
                                                       job_name,
                                                       ppn=1)
                if n == delay_every_n_jobs:
                    # Larger delay every time we've submitted n jobs
                    misowrap_obj.logger.info("Submitted %d jobs, now waiting %.2f mins." \
                                             %(n, batch_delay / 60.))
                    time.sleep(batch_delay)
                    n = 0
                time.sleep(base_delay)
            else:
                if not dry_run:
                    os.system(miso_cmd)
            n += 1
Ejemplo n.º 48
0
                filtered_df["sample2_exc_counts"]
            filtered_df = \
                filtered_df[(sample1_sum >= atleast_sum) \
                            | (sample2_sum >= atleast_sum)]
            # Filter constitutive reads
            filtered_df = \
                filtered_df[(filtered_df["sample1_const_counts"] >= atleast_const) \
                            | (filtered_df["sample2_const_counts"] >= atleast_const)]
            self.filtered_events[event_type] = filtered_df
    
    if not dry_run:
        # Call filtered comparisons here
        pass
    # Add gene information
    if get_genes_from_gff is not None:
        gene_table_fname = utils.pathify(get_genes_from_gff)
        print "Adding gene information from %s" %(gene_table_fname)
        if not os.path.isfile(gene_table_fname):
            print "Error: GFF file %s not found." %(gene_table_fname)
            sys.exit(1)
        events_to_genes = get_events_to_genes(gene_table_fname)
        
        

def main():
    argh.dispatch_commands([
        filter_comparisons,
        combine_comparisons
    ])
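# Illustrative usage, an assumption rather than documented behavior: with
# argh.dispatch_commands, each function above becomes a subcommand whose
# positional arguments are the function's required parameters and whose
# options come from its keyword arguments. The script name is a placeholder,
# and depending on the argh version the subcommand may be spelled with
# underscores instead of dashes:
#
#   python filter_events.py filter-comparisons results.miso_bf filtered/ --event-type SE
#   python filter_events.py combine-comparisons settings.txt logs/ --dry-run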
    
Ejemplo n.º 49
0
def compare(settings,
            logs_outdir,
            delay=5,
            dry_run=False):
    """
    Run a MISO samples comparison between all pairs of samples.
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="compare")
    bam_files = misowrap_obj.bam_files
    sample_labels = misowrap_obj.sample_labels
    read_len = misowrap_obj.read_len
    overhang_len = misowrap_obj.overhang_len
    miso_bin_dir = misowrap_obj.miso_bin_dir
    miso_output_dir = misowrap_obj.miso_outdir
    comparison_groups = misowrap_obj.comparison_groups
    comparisons_dir = misowrap_obj.comparisons_dir
    utils.make_dir(comparisons_dir)
    misowrap_obj.logger.info("Running MISO comparisons...")
    ##
    ## Compute comparisons between all pairs
    ## in a sample group
    ##
    for comp_group in comparison_groups:
        sample_pairs = utils.get_pairwise_comparisons(comp_group)
        print "  - Total of %d comparisons" %(len(sample_pairs))
        for sample1, sample2 in sample_pairs:
            # For each pair of samples, compare their output
            # along each event type
            misowrap_obj.logger.info("Comparing %s %s" %(sample1,
                                                         sample2))
            # Directories for each sample
            sample1_dir = os.path.join(miso_output_dir,
                                       sample1)
            sample2_dir = os.path.join(miso_output_dir,
                                       sample2)
            for event_type in misowrap_obj.event_types:
                sample1_event_dir = os.path.join(sample1_dir,
                                                 event_type)
                sample2_event_dir = os.path.join(sample2_dir,
                                                 event_type)
                job_name = "compare_%s_%s_%s" %(sample1,
                                                sample2,
                                                event_type)
                event_comparisons_dir = \
                    os.path.join(comparisons_dir,
                                 event_type)
                compare_cmd = "%s --compare-samples %s %s %s " \
                    "--comparison-labels %s %s" \
                    %(misowrap_obj.compare_miso_cmd,
                      sample1_event_dir,
                      sample2_event_dir,
                      event_comparisons_dir,
                      sample1,
                      sample2)
                misowrap_obj.logger.info("Executing: %s" %(compare_cmd))
                if misowrap_obj.use_cluster:
                    if not dry_run:
                        misowrap_obj.my_cluster.launch_job(compare_cmd,
                                                           job_name,
                                                           ppn=1)
                        time.sleep(delay)
                else:
                    if not dry_run:
                        os.system(compare_cmd)
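# Illustrative sketch, not part of the original example: the comparison
# command assembled above for one sample pair and event type. All paths are
# made-up placeholders.
#
#   /software/miso/bin/compare_miso --compare-samples \
#       /data/miso_output/sampleA/SE /data/miso_output/sampleB/SE \
#       /data/comparisons/SE \
#       --comparison-labels sampleA sampleB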
Ejemplo n.º 50
0
def run_miso_on_samples(settings_filename, output_dir,
                        use_cluster=True,
                        delay=120):
    """
    Run MISO on a set of samples.
    """
    misowrap_obj = MISOWrap(settings_filename, output_dir,
                            logger_label="run")
    bam_files = misowrap_obj.bam_files
    read_len = misowrap_obj.read_len
    overhang_len = misowrap_obj.overhang_len
    events_dir = misowrap_obj.miso_events_dir
    single_end = True
    if misowrap_obj.insert_lens_dir is not None:
        insert_lens_dir = misowrap_obj.insert_lens_dir
        misowrap_obj.logger.info("Running in paired-end mode...")
        misowrap_obj.logger.info(" - Insert length directory: %s" \
                                 %(insert_lens_dir))
        single_end = False
    else:
        misowrap_obj.logger.info("Running in single-end mode...")        
    run_events_analysis = misowrap_obj.run_events_cmd
    event_types_dirs = \
        miso_utils.get_event_types_dirs(misowrap_obj.settings_info)
    miso_settings_filename = misowrap_obj.miso_settings_filename
    for bam_input in bam_files:
        bam_filename, sample_label = bam_input
        bam_filename = utils.pathify(bam_filename)
        misowrap_obj.logger.info("Processing: %s" %(bam_filename))
        for event_type_dir in event_types_dirs:
            event_type = os.path.basename(event_type_dir)
            print "  - Using event dir: %s" %(event_type_dir)
            miso_cmd = "%s" %(run_events_analysis)
            bam_basename = os.path.basename(bam_filename)
            # Output directory for sample
            sample_output_dir = os.path.join(output_dir, 
                                             sample_label,
                                             event_type)
            # Pass sample to MISO along with event
            miso_cmd += " --compute-genes-psi %s %s" %(event_type_dir,
                                                       bam_filename)
            if not single_end:
                insert_len_filename = \
                    os.path.join(insert_lens_dir,
                                 "%s.insert_len" %(bam_basename))
                misowrap_obj.logger.info("Reading paired-end parameters " \
                                         "from file...")
                misowrap_obj.logger.info("  - PE file: %s" \
                                         %(insert_len_filename))
                pe_params = miso_utils.read_pe_params(insert_len_filename)
                # Paired-end parameters
                miso_cmd += " --paired-end %.2f %.2f" %(pe_params["mean"],
                                                        pe_params["sdev"])
            # Read length
            miso_cmd += " --read-len %d" %(read_len)
            # Overhang length
            miso_cmd += " --overhang-len %d" %(overhang_len)
            # Prefilter?
            if misowrap_obj.prefilter_miso:
                miso_cmd += " --prefilter"
            # Output directory
            miso_cmd += " --output-dir %s" %(sample_output_dir)
            # Use cluster
            if misowrap_obj.use_cluster:
                miso_cmd += " --use-cluster"
                miso_cmd += " --chunk-jobs %d" %(misowrap_obj.chunk_jobs)
            # Settings
            miso_cmd += " --settings %s" %(miso_settings_filename)
            misowrap_obj.logger.info("Executing: %s" %(miso_cmd))
            job_name = "%s_%s" %(sample_label, event_type)
            if use_cluster:
                misowrap_obj.my_cluster.launch_job(miso_cmd,
                                                   job_name)
                time.sleep(delay)
            else:
                os.system(miso_cmd)