Esempio n. 1
0
 def get_event_data_as_stream(self, event_name):
     """
     Get data for given event as a file-like stream. If there's no
     data, return None.

     The given event name is first translated into the representation
     used by the database (compressed or uncompressed), using the
     self.uncomp_to_comp / self.comp_to_uncomp mappings.

     Parameters:
       event_name: name of the event to look up; may be either a
         compressed or an uncompressed ID.

     Returns:
       A StringIO stream holding the row's header field followed by
       its psi values/scores field, each terminated by a newline.
       None if the event has no entry in the relevant mapping or no
       row in the table.

     Raises:
       Exception: if the database contains compressed IDs but a
         mapping is missing, if a compressed event is requested from
         an uncompressed database without a mapping, or if more than
         one row matches the event.
     """
     # Error checking: if the database is compressed but we're not
     # given a mapping, this is a major error
     if self.is_db_events_compressed and \
        ((self.comp_to_uncomp is None) or (self.uncomp_to_comp is None)):
         raise Exception("The database contains compressed IDs but no "
                         "mapping (.shelve) file was given.")
     name_is_compressed = misc_utils.is_compressed_name(event_name)
     # The name of the event as stored in database (if using
     # compressed event IDs, this would be a misocomp ID)
     event_to_query = event_name
     if self.is_db_events_compressed and (not name_is_compressed):
         # Database stores compressed IDs but we were handed an
         # uncompressed one: query by its compressed representation
         if event_name not in self.uncomp_to_comp:
             return None
         event_to_query = self.uncomp_to_comp[event_name]
     elif (not self.is_db_events_compressed) and name_is_compressed:
         # Database stores *uncompressed* IDs but we were handed a
         # compressed one: without a mapping we can't use this event
         if self.comp_to_uncomp is None:
             raise Exception("Cannot get compressed event %s from "
                             "uncompressed database." % (event_name))
         if event_name not in self.comp_to_uncomp:
             return None
         event_to_query = self.comp_to_uncomp[event_name]
     c = self.conn.cursor()
     # The table name cannot be bound as a SQL parameter, so it is still
     # interpolated; the event name is bound with a placeholder so that
     # quotes or other special characters in it cannot alter the query.
     results = \
       c.execute("SELECT * from %s WHERE event_name=?" % (self.table_name),
                 (event_to_query,))
     rows = results.fetchall()
     if len(rows) == 0:
         # Event not found
         return None
     if len(rows) > 1:
         raise Exception("More than one entry for event %s"
                         % (event_to_query))
     # Each row is (event name, psi values and scores, header); the
     # stored event name is not needed for the output stream
     _, psi_vals_and_scores, header = rows[0]
     event_data = "%s\n%s\n" % (header, psi_vals_and_scores)
     return StringIO.StringIO(event_data)
Esempio n. 2
0
 def get_event_data_as_stream(self, event_name):
     """
     Get data for given event as a file-like stream. If there's no
     data, return None.

     Before querying, the event name is converted to the form the
     database stores (compressed or uncompressed) via the
     self.uncomp_to_comp / self.comp_to_uncomp mappings.

     Parameters:
       event_name: event to look up, given as either a compressed or
         an uncompressed ID.

     Returns:
       A StringIO stream with the row's header field and then its psi
       values/scores field, each followed by a newline; None when the
       event is absent from the relevant mapping or from the table.

     Raises:
       Exception: when the database holds compressed IDs but a mapping
         is missing, when a compressed event is requested from an
         uncompressed database with no mapping, or when several rows
         match the event.
     """
     # Error checking: if the database is compressed but we're not
     # given a mapping, this is a major error
     if self.is_db_events_compressed and \
        ((self.comp_to_uncomp is None) or (self.uncomp_to_comp is None)):
         raise Exception("The database contains compressed IDs but no "
                         "mapping (.shelve) file was given.")
     given_is_compressed = misc_utils.is_compressed_name(event_name)
     # The name of the event as stored in database (if using
     # compressed event IDs, this would be a misocomp ID)
     event_to_query = event_name
     if self.is_db_events_compressed and (not given_is_compressed):
         # Compressed database, uncompressed request: look at the
         # compressed representation
         if event_name not in self.uncomp_to_comp:
             return None
         event_to_query = self.uncomp_to_comp[event_name]
     elif (not self.is_db_events_compressed) and given_is_compressed:
         # Uncompressed database, compressed request: uncompress the
         # event, which is impossible without a mapping
         if self.comp_to_uncomp is None:
             raise Exception("Cannot get compressed event %s from "
                             "uncompressed database." % (event_name))
         if event_name not in self.comp_to_uncomp:
             return None
         event_to_query = self.comp_to_uncomp[event_name]
     c = self.conn.cursor()
     # Bind the event name as a query parameter rather than formatting
     # it into the SQL text, so names containing quotes are handled
     # correctly. The table name cannot be bound and stays interpolated.
     results = \
       c.execute("SELECT * from %s WHERE event_name=?" % (self.table_name),
                 (event_to_query,))
     rows = results.fetchall()
     if len(rows) == 0:
         # Event not found
         return None
     if len(rows) > 1:
         raise Exception("More than one entry for event %s"
                         % (event_to_query))
     # Row layout is (event name, psi values and scores, header); only
     # the last two go into the stream
     _, psi_vals_and_scores, header = rows[0]
     event_data = "%s\n%s\n" % (header, psi_vals_and_scores)
     return StringIO.StringIO(event_data)
Esempio n. 3
0
 def is_event_name_compressed(self):
     """
     Determine if the events in the database are compressed
     or not.

     Only the first row returned from the table is inspected: its
     first column is converted to a string and passed to
     misc_utils.is_compressed_name.

     Returns:
       The result of misc_utils.is_compressed_name for that first
       event name, or False if the table contains no rows.
     """
     c = self.conn.cursor()
     results = \
       c.execute("SELECT * from %s" % (self.table_name))
     first_result = results.fetchone()
     if first_result is None:
         # Empty table: there is no event name to inspect, so report
         # "not compressed" instead of failing on first_result[0]
         return False
     event_name = str(first_result[0])
     return misc_utils.is_compressed_name(event_name)
Esempio n. 4
0
 def is_event_name_compressed(self):
     """
     Determine if the events in the database are compressed
     or not.

     The decision is based on the first row returned from the table
     alone: the first column of that row is stringified and checked
     with misc_utils.is_compressed_name.

     Returns:
       Whatever misc_utils.is_compressed_name reports for that event
       name; False when the table has no rows at all.
     """
     c = self.conn.cursor()
     results = \
       c.execute("SELECT * from %s" % (self.table_name))
     first_result = results.fetchone()
     if first_result is None:
         # No rows means no event names, hence nothing compressed;
         # indexing None here would otherwise raise a TypeError
         return False
     event_name = str(first_result[0])
     return misc_utils.is_compressed_name(event_name)
Esempio n. 5
0
 def get_all_event_names(self):
     """
     Collect the event names found across self.all_filenames.

     Plain-text ".miso" files contribute one event name each (via
     get_event_name); MISO database files contribute every event name
     they contain. Files of any other kind are ignored. As a side
     effect, each collected name is recorded in
     self.event_names_to_fnames, mapped to the file it came from.

     Returns the list of collected event names.
     """
     collected = []
     for fname in self.all_filenames:
         if fname.endswith(".miso"):
             # Regular .miso plain text file: a single event
             name = get_event_name(
                 fname,
                 use_compressed_map=self.compressed_ids_to_genes)
             collected.append(name)
             self.event_names_to_fnames[name] = fname
             continue
         if not miso_db.is_miso_db_fname(fname):
             # Neither a .miso file nor a MISO database
             continue
         # MISO database file: take every event name stored in it and
         # map each one to this database file
         db = miso_db.MISODatabase(
             fname,
             comp_to_uncomp=self.compressed_ids_to_genes)
         for raw_name in db.get_all_event_names():
             name = str(raw_name)
             # When a compressed-ID mapping was supplied, names are
             # tracked by their compressed ID, so an uncompressed name
             # coming out of the database is converted to its
             # compressed counterpart.
             convert = (self.compressed_ids_to_genes is not None) and \
                 (not misc_utils.is_compressed_name(name))
             if convert:
                 name = str(db.uncomp_to_comp[name])
             collected.append(name)
             self.event_names_to_fnames[name] = fname
     return collected
Esempio n. 6
0
 def get_all_event_names(self):
     """
     Gather every event name available from self.all_filenames.

     A ".miso" text file yields the single name produced by
     get_event_name; a MISO database file yields all of the names it
     stores. Other files are skipped. Every gathered name is also
     written into self.event_names_to_fnames, keyed by name with the
     originating filename as value.

     Returns the gathered names as a list.
     """
     names = []
     for path in self.all_filenames:
         is_plain_miso = path.endswith(".miso")
         if is_plain_miso:
             # Plain text .miso file holds exactly one event
             plain_name = get_event_name(
                 path,
                 use_compressed_map=self.compressed_ids_to_genes)
             names.append(plain_name)
             self.event_names_to_fnames[plain_name] = path
         elif miso_db.is_miso_db_fname(path):
             # Database file: load all the event names it contains and
             # associate each with this .miso_db file
             database = miso_db.MISODatabase(
                 path,
                 comp_to_uncomp=self.compressed_ids_to_genes)
             for db_name in database.get_all_event_names():
                 db_name = str(db_name)
                 if self.compressed_ids_to_genes is None:
                     final_name = db_name
                 elif misc_utils.is_compressed_name(db_name):
                     final_name = db_name
                 else:
                     # A mapping was given and this ID is uncompressed,
                     # so switch to its compressed ID
                     final_name = str(database.uncomp_to_comp[db_name])
                 names.append(final_name)
                 self.event_names_to_fnames[final_name] = path
     return names
Esempio n. 7
0
def summarize_sampler_results(samples_dir,
                              summary_filename,
                              use_compressed=None):
    """
    Given a set of samples from MISO, output a summary file.

    The summary is a tab-delimited text file with one header line and
    one line per summarized event. Events whose samples cannot be
    loaded, or whose samples array has fewer than two dimensions, are
    skipped with a warning.

    Parameters:
      samples_dir: samples location, passed to MISOSamples.
      summary_filename: path of the summary file to (over)write.
      use_compressed: passed through to MISOSamples. When it is None,
        a warning is printed for each event name that looks like a
        compressed ID.
    """
    header_fields = [
        "event_name",
        "miso_posterior_mean",
        "ci_low",
        "ci_high",
        "isoforms",
        "counts",
        "assigned_counts",
        # Fields related to gene/event
        "chrom",
        "strand",
        "mRNA_starts",
        "mRNA_ends"
    ]
    num_events = 0
    # The with-block guarantees the summary file is flushed and closed
    # even if processing one of the events raises.
    with open(summary_filename, 'w') as summary_file:
        summary_file.write("%s\n" % ("\t".join(header_fields)))
        print("Loading events from: %s" % (samples_dir))
        print("Writing summary to: %s" % (summary_filename))
        samples_obj = MISOSamples(samples_dir, use_compressed=use_compressed)

        for event_name in samples_obj.all_event_names:
            samples_results = samples_obj.get_event_samples(event_name)
            if samples_results is None:
                # Skip files that could not be parsed
                print("WARNING: Skipping %s" % (event_name))
                continue
            # If we're not given a mapping to compressed IDs, check
            # that the event IDs do not look compressed
            if misc_utils.is_compressed_name(event_name) and \
               (use_compressed is None):
                print("WARNING: %s looks like a compressed id, but no mapping file "
                      "from compressed IDs to event IDs was given! Try: --use-compressed"
                      % (event_name))
            # Load header/parameters information
            samples = samples_results[0]
            header = samples_results[1][0]
            params = parse_sampler_params_from_header(header)
            # Get counts information from header
            counts_info = samples_results[5]
            if len(shape(samples)) < 2:
                print("WARNING: Skipping %s -- mishaped file" % (event_name))
                continue
            # Unpacking also enforces that samples is exactly
            # two-dimensional (samples x isoforms)
            num_samples, num_isoforms = shape(samples)
            output_fields = format_credible_intervals(event_name, samples)

            # Add isoforms information to output fields
            output_fields.append(get_isoforms_from_header(header))

            # Add counts information to output fields
            output_fields.append(counts_info['counts'])
            output_fields.append(counts_info['assigned_counts'])

            # Add gene/event information to output fields
            gene_info = get_gene_info_from_params(params)
            output_fields.append(gene_info["chrom"])
            output_fields.append(gene_info["strand"])
            output_fields.append(gene_info["mRNA_starts"])
            output_fields.append(gene_info["mRNA_ends"])

            summary_file.write("%s\n" % ("\t".join(output_fields)))
            num_events += 1
        print("  - Summarized a total of %d events." % (num_events))
Esempio n. 8
0
def summarize_sampler_results(samples_dir, summary_filename,
                              use_compressed=None):
    """
    Given a set of samples from MISO, output a summary file.

    Writes a tab-delimited text file consisting of a header line and
    one line per summarized event. An event is skipped (with a
    warning) when its samples cannot be loaded or when its samples
    array has fewer than two dimensions.

    Parameters:
      samples_dir: samples location, handed to MISOSamples.
      summary_filename: path of the summary file to (over)write.
      use_compressed: handed to MISOSamples. If None, a warning is
        printed for every event name that looks like a compressed ID.
    """
    header_fields = ["event_name", "miso_posterior_mean", "ci_low", "ci_high",
                     "isoforms", "counts", "assigned_counts",
                     # Fields related to gene/event
                     "chrom",
                     "strand",
                     "mRNA_starts",
                     "mRNA_ends"]
    num_events = 0
    # Use a with-block so the summary file is closed even when an
    # exception interrupts the loop below.
    with open(summary_filename, 'w') as summary_file:
        summary_header = "%s\n" % ("\t".join(header_fields))
        summary_file.write(summary_header)
        print("Loading events from: %s" % (samples_dir))
        print("Writing summary to: %s" % (summary_filename))
        samples_obj = MISOSamples(samples_dir,
                                  use_compressed=use_compressed)

        for event_name in samples_obj.all_event_names:
            samples_results = samples_obj.get_event_samples(event_name)
            if samples_results is None:
                print("WARNING: Skipping %s" % (event_name))
                # Skip files that could not be parsed
                continue
            # If we're not given a mapping to compressed IDs, check
            # that the event IDs do not look compressed
            if misc_utils.is_compressed_name(event_name) and \
               (use_compressed is None):
                print("WARNING: %s looks like a compressed id, but no mapping file "
                      "from compressed IDs to event IDs was given! Try: --use-compressed"
                      % (event_name))
            # Load header/parameters information
            samples = samples_results[0]
            header = samples_results[1]
            header = header[0]
            params = parse_sampler_params_from_header(header)
            # Get counts information from header
            counts_info = samples_results[5]
            shape_len = len(shape(samples))
            if shape_len < 2:
                # Report the event being processed; no filename variable
                # exists in this function.
                print("WARNING: Skipping %s -- mishaped file" % (event_name))
                continue
            # Unpacking also enforces that samples is exactly
            # two-dimensional (samples x isoforms)
            num_samples, num_isoforms = shape(samples)
            output_fields = format_credible_intervals(event_name, samples)

            # Add isoforms information to output fields
            isoforms_field = get_isoforms_from_header(header)
            output_fields.append(isoforms_field)

            # Add counts information to output fields
            output_fields.append(counts_info['counts'])
            output_fields.append(counts_info['assigned_counts'])

            # Add gene/event information to output fields
            gene_info = get_gene_info_from_params(params)
            output_fields.append(gene_info["chrom"])
            output_fields.append(gene_info["strand"])
            output_fields.append(gene_info["mRNA_starts"])
            output_fields.append(gene_info["mRNA_ends"])

            output_line = "%s\n" % ("\t".join(output_fields))
            summary_file.write(output_line)
            num_events += 1
        print("  - Summarized a total of %d events." % (num_events))
Esempio n. 9
0
def summarize_sampler_results(samples_dir,
                              summary_filename,
                              use_compressed=None):
    """
    Given a set of samples from MISO, output a summary file.

    Iterates over the filenames returned by get_samples_dir_filenames
    and writes a tab-delimited text file: one header line, then one
    line per summarized samples file. A file is skipped when no event
    name can be derived from it, when its samples cannot be loaded, or
    when its samples array has fewer than two dimensions.

    Parameters:
      samples_dir: directory passed to get_samples_dir_filenames.
      summary_filename: path of the summary file to (over)write.
      use_compressed: if not None, passed to
        misc_utils.load_compressed_ids_to_genes to obtain a mapping
        from compressed IDs to event names (presumably the location
        of a mapping file -- confirm against callers). Each event name
        is then replaced by its mapped value. If None, a warning is
        printed for event names that look compressed.

    Exits the process with status 1 (sys.exit) if a mapping is in use
    and an event name is missing from it.
    """
    header_fields = [
        "event_name",
        "miso_posterior_mean",
        "ci_low",
        "ci_high",
        "isoforms",
        "counts",
        "assigned_counts",
        # Fields related to gene/event
        "chrom",
        "strand",
        "mRNA_starts",
        "mRNA_ends"
    ]
    num_events = 0
    # The with-block closes (and flushes) the summary file on every exit
    # path, including the sys.exit(1) below and any parsing exception.
    with open(summary_filename, 'w') as summary_file:
        summary_file.write("%s\n" % ("\t".join(header_fields)))
        print("Loading events from: %s" % (samples_dir))
        print("Writing summary to: %s" % (summary_filename))
        all_filenames = get_samples_dir_filenames(samples_dir)

        compressed_ids_to_genes = {}
        if use_compressed is not None:
            print("  - Loading compressed IDs mapping from: %s" % (use_compressed))
            # Load mapping from compressed IDs to event names
            compressed_ids_to_genes = misc_utils.load_compressed_ids_to_genes(
                use_compressed)

        for samples_filename in all_filenames:
            # Parse sampler parameters
            params = parse_sampler_params(samples_filename)
            event_name = get_event_name(samples_filename)

            if event_name is None:
                print("Skipping %s" % (samples_filename))
                continue
            if use_compressed is not None:
                # If using compressed event IDs, convert event
                # to its real event ID
                if event_name not in compressed_ids_to_genes:
                    print("Error: Compressed id %s does not map to any event name."
                          % (event_name))
                    sys.exit(1)
                event_name = compressed_ids_to_genes[event_name]
            elif misc_utils.is_compressed_name(event_name):
                # If we're not given a mapping to compressed IDs, check
                # that the event IDs do not look compressed
                print("WARNING: %s looks like a compressed id, but no mapping file "
                      "from compressed IDs to event IDs was given! Try: --use-compressed"
                      % (event_name))

            # Load samples and header information
            samples_results = load_samples(samples_filename)
            if samples_results is None:
                # Skip files that could not be parsed
                print("Skipping %s" % (samples_filename))
                continue
            samples = samples_results[0]
            header = samples_results[1][0]

            # Get counts information from header
            counts_info = samples_results[5]

            if len(shape(samples)) < 2:
                print("Skipping %s" % (samples_filename))
                continue
            # Unpacking also enforces that samples is exactly
            # two-dimensional (samples x isoforms)
            num_samples, num_isoforms = shape(samples)
            output_fields = format_credible_intervals(event_name, samples)

            # Add isoforms information to output fields
            output_fields.append(get_isoforms_from_header(header))

            # Add counts information to output fields
            output_fields.append(counts_info['counts'])
            output_fields.append(counts_info['assigned_counts'])

            # Add gene/event information to output fields
            gene_info = get_gene_info_from_params(params)
            output_fields.append(gene_info["chrom"])
            output_fields.append(gene_info["strand"])
            output_fields.append(gene_info["mRNA_starts"])
            output_fields.append(gene_info["mRNA_ends"])

            summary_file.write("%s\n" % ("\t".join(output_fields)))
            num_events += 1
        print("  - Summarized a total of %d events." % (num_events))