def get_event_data_as_stream(self, event_name):
    """
    Get data for given event as a file-like stream.

    The given name is first translated into the form used by the
    database: an uncompressed name is mapped through
    ``self.uncomp_to_comp`` when the database stores compressed IDs,
    and a compressed name is mapped through ``self.comp_to_uncomp``
    when the database stores uncompressed IDs.

    Returns a ``StringIO`` holding the row's header followed by its
    psi values/scores, each terminated by a newline.  Returns None if
    the name has no entry in the relevant mapping or no row matches.

    Raises Exception if the database is compressed but a mapping is
    missing, if a compressed name is requested from an uncompressed
    database without a mapping, or if more than one row matches.
    """
    # Error checking: if the database is compressed but we're not
    # given a mapping, this is a major error
    if self.is_db_events_compressed and \
       ((self.comp_to_uncomp is None) or (self.uncomp_to_comp is None)):
        raise Exception("The database contains compressed IDs but no "
                        "mapping (.shelve) file was given.")
    # The name of the event as stored in database; defaults to the
    # name we were given and is replaced below if a translation
    # between compressed/uncompressed forms is required
    event_to_query = event_name
    name_is_compressed = misc_utils.is_compressed_name(event_name)
    if self.is_db_events_compressed:
        # Compressed database, uncompressed name: look up the
        # compressed representation
        if not name_is_compressed:
            if event_name not in self.uncomp_to_comp:
                return None
            event_to_query = self.uncomp_to_comp[event_name]
    elif name_is_compressed:
        # Uncompressed database, compressed name: without a mapping
        # we cannot use this event
        if self.comp_to_uncomp is None:
            raise Exception("Cannot get compressed event %s from "
                            "uncompressed database." % (event_name))
        if event_name not in self.comp_to_uncomp:
            return None
        event_to_query = self.comp_to_uncomp[event_name]
    c = self.conn.cursor()
    # Only the table name is formatted into the statement (identifiers
    # cannot be bound); the event name is passed as a bound parameter
    # so quotes or other special characters in it cannot alter the SQL.
    # NOTE(review): the "?" placeholder assumes self.conn is a sqlite3
    # connection (qmark paramstyle) -- confirm.
    query = "SELECT * from %s WHERE event_name=?" % (self.table_name)
    rows = c.execute(query, (event_to_query,)).fetchall()
    if len(rows) == 0:
        # Event not found
        return None
    if len(rows) > 1:
        raise Exception("More than one entry for event %s"
                        % (event_to_query))
    # Each row is expected to have exactly three columns; the stored
    # name itself is not needed for the output
    _stored_name, psi_vals_and_scores, header = rows[0]
    event_data = "%s\n%s\n" % (header, psi_vals_and_scores)
    return StringIO.StringIO(event_data)
def get_event_data_as_stream(self, event_name):
    """
    Get data for given event. If there's no data, return None.

    The event name is converted to the representation stored in the
    database before querying: through ``self.uncomp_to_comp`` when the
    database holds compressed IDs and the name is uncompressed, or
    through ``self.comp_to_uncomp`` when the database holds
    uncompressed IDs and the name is compressed.  A name missing from
    the mapping it needs yields None.

    On success the result is a ``StringIO`` with two newline-terminated
    lines: the row's header, then its psi values/scores.

    Raises Exception when a required mapping was not supplied or when
    the query matches more than one row.
    """
    db_is_compressed = self.is_db_events_compressed
    # A compressed database cannot be used unless both directions of
    # the ID mapping are available
    if db_is_compressed and \
       ((self.comp_to_uncomp is None) or (self.uncomp_to_comp is None)):
        raise Exception("The database contains compressed IDs but no "
                        "mapping (.shelve) file was given.")
    given_is_compressed = misc_utils.is_compressed_name(event_name)
    # Name to look up in the database; same as the given name unless
    # its form differs from what the database stores
    event_to_query = event_name
    if db_is_compressed and (not given_is_compressed):
        # Database stores compressed IDs: use the compressed form
        if event_name not in self.uncomp_to_comp:
            return None
        event_to_query = self.uncomp_to_comp[event_name]
    elif (not db_is_compressed) and given_is_compressed:
        # Database stores uncompressed IDs: uncompress the event,
        # which is impossible without a mapping
        if self.comp_to_uncomp is None:
            raise Exception("Cannot get compressed event %s from "
                            "uncompressed database." % (event_name))
        if event_name not in self.comp_to_uncomp:
            return None
        event_to_query = self.comp_to_uncomp[event_name]
    c = self.conn.cursor()
    # Bind the event name as a parameter instead of formatting it into
    # the SQL text, so a name containing a quote cannot break or change
    # the statement.  The table name is an identifier and still has to
    # be formatted in.
    # NOTE(review): "?" is the sqlite3 placeholder style; confirm that
    # self.conn is a sqlite3 connection.
    results = c.execute("SELECT * from %s WHERE event_name=?"
                        % (self.table_name),
                        (event_to_query,))
    rows = results.fetchall()
    if len(rows) == 0:
        # Event not found
        return None
    if len(rows) > 1:
        raise Exception("More than one entry for event %s"
                        % (event_to_query))
    # Rows are unpacked as (name, psi values/scores, header); only the
    # last two are written to the stream
    _db_event_name, psi_vals_and_scores, header = rows[0]
    event_data = "%s\n%s\n" % (header, psi_vals_and_scores)
    event_stream = StringIO.StringIO(event_data)
    return event_stream
def is_event_name_compressed(self):
    """
    Determine if the events in the database are compressed or not.

    The decision is based on the first column of the first row
    returned from ``self.table_name``, converted to ``str`` and passed
    to ``misc_utils.is_compressed_name``.

    Returns False if the table contains no rows, since there is then
    no event name to inspect.
    """
    c = self.conn.cursor()
    results = c.execute("SELECT * from %s" % (self.table_name))
    first_result = results.fetchone()
    if first_result is None:
        # Empty table: fetchone() gives None, and indexing it would
        # raise TypeError
        return False
    event_name = str(first_result[0])
    return misc_utils.is_compressed_name(event_name)
def get_all_event_names(self):
    """
    Collect the event names found in ``self.all_filenames``.

    Plain ``.miso`` files contribute the single name produced by
    ``get_event_name``; MISO database files contribute every name
    reported by the database.  Each collected name is also recorded in
    ``self.event_names_to_fnames`` against the file it came from.
    Files of any other kind are ignored.

    Returns the list of collected names, in the order encountered.
    """
    collected = []
    for fname in self.all_filenames:
        if fname.endswith(".miso"):
            # Regular plain text .miso file: one event per file
            name = get_event_name(
                fname,
                use_compressed_map=self.compressed_ids_to_genes)
            collected.append(name)
            self.event_names_to_fnames[name] = fname
            continue
        if not miso_db.is_miso_db_fname(fname):
            # Neither a .miso file nor a MISO database
            continue
        # MISO database file: load all the event names stored in it
        db = miso_db.MISODatabase(
            fname,
            comp_to_uncomp=self.compressed_ids_to_genes)
        for db_name in db.get_all_event_names():
            name = str(db_name)
            # When a compressed-ID mapping was given, a name that does
            # not look compressed is replaced by its entry in the
            # database's uncomp_to_comp mapping
            if (self.compressed_ids_to_genes is not None) and \
               (not misc_utils.is_compressed_name(name)):
                name = str(db.uncomp_to_comp[name])
            collected.append(name)
            # Map the event to the .miso_db file that contains it
            self.event_names_to_fnames[name] = fname
    return collected
def summarize_sampler_results(samples_dir, summary_filename,
                              use_compressed=None):
    """
    Given a set of samples from MISO, output a summary file.

    Writes a tab-separated file to ``summary_filename``: one header
    line followed by one line per summarized event loaded from
    ``samples_dir`` through ``MISOSamples``.  Events whose samples
    cannot be loaded, or whose samples array has fewer than two
    dimensions, are skipped with a warning.

    ``use_compressed`` is passed through to ``MISOSamples``; when it
    is None and an event name looks like a compressed ID, a warning is
    printed but the event is still summarized.

    Returns None.  Progress and warnings are printed to stdout.
    """
    header_fields = ["event_name",
                     "miso_posterior_mean",
                     "ci_low",
                     "ci_high",
                     "isoforms",
                     "counts",
                     "assigned_counts",
                     # Fields related to gene/event
                     "chrom",
                     "strand",
                     "mRNA_starts",
                     "mRNA_ends"]
    num_events = 0
    # The with-block guarantees the summary file is closed even if
    # loading or formatting an event raises part-way through.
    # Each print below takes one parenthesised string, which prints
    # identically as a Python 2 statement and a Python 3 function call.
    with open(summary_filename, 'w') as summary_file:
        summary_file.write("%s\n" % ("\t".join(header_fields)))
        print("Loading events from: %s" % (samples_dir))
        print("Writing summary to: %s" % (summary_filename))
        samples_obj = MISOSamples(samples_dir,
                                  use_compressed=use_compressed)
        for event_name in samples_obj.all_event_names:
            samples_results = samples_obj.get_event_samples(event_name)
            if samples_results is None:
                # Skip files that could not be parsed
                print("WARNING: Skipping %s" % (event_name))
                continue
            # If we're not given a mapping to compressed IDs, check
            # that the event IDs do not look compressed
            if misc_utils.is_compressed_name(event_name) and \
               (use_compressed is None):
                print("WARNING: %s looks like a compressed id, but no "
                      "mapping file from compressed IDs to event IDs "
                      "was given! Try: --use-compressed" % (event_name))
            # Load header/parameters information
            samples = samples_results[0]
            header = samples_results[1][0]
            params = parse_sampler_params_from_header(header)
            # Get counts information from header
            counts_info = samples_results[5]
            if len(shape(samples)) < 2:
                print("WARNING: Skipping %s -- mishaped file"
                      % (event_name))
                continue
            # Unpacking is kept although the values are unused: it
            # raises if samples is not exactly two-dimensional
            num_samples, num_isoforms = shape(samples)
            output_fields = format_credible_intervals(event_name, samples)
            # Add isoforms information to output fields
            output_fields.append(get_isoforms_from_header(header))
            # Add counts information to output fields
            output_fields.append(counts_info['counts'])
            output_fields.append(counts_info['assigned_counts'])
            # Add gene/event information to output fields
            gene_info = get_gene_info_from_params(params)
            output_fields.append(gene_info["chrom"])
            output_fields.append(gene_info["strand"])
            output_fields.append(gene_info["mRNA_starts"])
            output_fields.append(gene_info["mRNA_ends"])
            summary_file.write("%s\n" % ("\t".join(output_fields)))
            num_events += 1
        print(" - Summarized a total of %d events." % (num_events))
def summarize_sampler_results(samples_dir, summary_filename,
                              use_compressed=None):
    """
    Given a set of samples from MISO, output a summary file.

    The summary is a tab-separated file written to
    ``summary_filename`` with a header line and one line for every
    event from ``MISOSamples(samples_dir, ...)`` that could be loaded
    and whose samples array has at least two dimensions.  Other events
    are skipped with a warning printed to stdout.

    ``use_compressed`` is forwarded to ``MISOSamples``.  If it is None
    and an event name looks like a compressed ID, a warning is printed
    and the event is summarized under that name.

    Returns None.
    """
    header_fields = ["event_name",
                     "miso_posterior_mean",
                     "ci_low",
                     "ci_high",
                     "isoforms",
                     "counts",
                     "assigned_counts",
                     # Fields related to gene/event
                     "chrom",
                     "strand",
                     "mRNA_starts",
                     "mRNA_ends"]
    summary_header = "%s\n" % ("\t".join(header_fields))
    num_events = 0
    # Use a with-block so the summary file is closed even when an
    # exception interrupts the loop.  Prints use a single parenthesised
    # string so they behave the same under Python 2 and Python 3.
    with open(summary_filename, 'w') as summary_file:
        summary_file.write(summary_header)
        print("Loading events from: %s" % (samples_dir))
        print("Writing summary to: %s" % (summary_filename))
        samples_obj = MISOSamples(samples_dir,
                                  use_compressed=use_compressed)
        for event_name in samples_obj.all_event_names:
            samples_results = samples_obj.get_event_samples(event_name)
            if samples_results is None:
                print("WARNING: Skipping %s" % (event_name))
                # Skip files that could not be parsed
                continue
            # If we're not given a mapping to compressed IDs, check
            # that the event IDs do not look compressed
            if misc_utils.is_compressed_name(event_name) and \
               (use_compressed is None):
                print("WARNING: %s looks like a compressed id, but no "
                      "mapping file from compressed IDs to event IDs "
                      "was given! Try: --use-compressed" % (event_name))
            # Load header/parameters information
            samples = samples_results[0]
            header = samples_results[1]
            header = header[0]
            params = parse_sampler_params_from_header(header)
            # Get counts information from header
            counts_info = samples_results[5]
            shape_len = len(shape(samples))
            if shape_len < 2:
                # Report the event name: this function iterates over
                # event names and has no samples_filename variable, so
                # formatting that name here raised NameError instead
                # of skipping the event
                print("WARNING: Skipping %s -- mishaped file"
                      % (event_name))
                continue
            # Kept for its side effect: unpacking raises unless the
            # samples array is exactly two-dimensional
            num_samples, num_isoforms = shape(samples)
            output_fields = format_credible_intervals(event_name, samples)
            # Add isoforms information to output fields
            isoforms_field = get_isoforms_from_header(header)
            output_fields.append(isoforms_field)
            # Add counts information to output fields
            output_fields.append(counts_info['counts'])
            output_fields.append(counts_info['assigned_counts'])
            gene_info = get_gene_info_from_params(params)
            for gene_key in ("chrom", "strand", "mRNA_starts", "mRNA_ends"):
                output_fields.append(gene_info[gene_key])
            output_line = "%s\n" % ("\t".join(output_fields))
            summary_file.write(output_line)
            num_events += 1
        print(" - Summarized a total of %d events." % (num_events))
def summarize_sampler_results(samples_dir, summary_filename,
                              use_compressed=None):
    """
    Given a set of samples from MISO, output a summary file.

    Every file returned by ``get_samples_dir_filenames(samples_dir)``
    is parsed and, if usable, written as one tab-separated line to
    ``summary_filename`` after a header line.  Files with no event
    name, files whose samples cannot be loaded, and files whose
    samples array has fewer than two dimensions are skipped with a
    message printed to stdout.

    ``use_compressed``, when not None, is passed to
    ``misc_utils.load_compressed_ids_to_genes`` and the resulting
    mapping is used to translate each event name.  An event name
    missing from that mapping terminates the program through
    ``sys.exit(1)``.  When ``use_compressed`` is None, names that look
    compressed only trigger a warning.

    Returns None.
    """
    header_fields = ["event_name",
                     "miso_posterior_mean",
                     "ci_low",
                     "ci_high",
                     "isoforms",
                     "counts",
                     "assigned_counts",
                     # Fields related to gene/event
                     "chrom",
                     "strand",
                     "mRNA_starts",
                     "mRNA_ends"]
    num_events = 0
    # The with-block closes the summary file on every exit path,
    # including exceptions and the sys.exit() below.  Prints take a
    # single parenthesised string so they behave the same under
    # Python 2 and Python 3.
    with open(summary_filename, 'w') as summary_file:
        summary_file.write("%s\n" % ("\t".join(header_fields)))
        print("Loading events from: %s" % (samples_dir))
        print("Writing summary to: %s" % (summary_filename))
        all_filenames = get_samples_dir_filenames(samples_dir)
        compressed_ids_to_genes = {}
        if use_compressed is not None:
            print(" - Loading compressed IDs mapping from: %s"
                  % (use_compressed))
            # Load mapping from gene IDs to their hashes
            compressed_ids_to_genes = \
                misc_utils.load_compressed_ids_to_genes(use_compressed)
        for samples_filename in all_filenames:
            # Parse sampler parameters
            params = parse_sampler_params(samples_filename)
            event_name = get_event_name(samples_filename)
            if event_name is None:
                print("Skipping %s" % (samples_filename))
                continue
            if use_compressed is not None:
                # If using compressed event IDs, convert event
                # to its real event ID
                if event_name not in compressed_ids_to_genes:
                    print("Error: Compressed id %s does not map to any "
                          "event name." % (event_name))
                    sys.exit(1)
                event_name = compressed_ids_to_genes[event_name]
            elif misc_utils.is_compressed_name(event_name):
                # If we're not given a mapping to compressed IDs, check
                # that the event IDs do not look compressed
                print("WARNING: %s looks like a compressed id, but no "
                      "mapping file from compressed IDs to event IDs "
                      "was given! Try: --use-compressed" % (event_name))
            # Load samples and header information
            samples_results = load_samples(samples_filename)
            if samples_results is None:
                # Skip files that could not be parsed
                print("Skipping %s" % (samples_filename))
                continue
            samples = samples_results[0]
            header = samples_results[1][0]
            # Get counts information from header
            counts_info = samples_results[5]
            if len(shape(samples)) < 2:
                print("Skipping %s" % (samples_filename))
                continue
            # Unpacking is kept although the values are unused: it
            # raises if samples is not exactly two-dimensional
            num_samples, num_isoforms = shape(samples)
            output_fields = format_credible_intervals(event_name, samples)
            # Add isoforms information to output fields
            output_fields.append(get_isoforms_from_header(header))
            # Add counts information to output fields
            output_fields.append(counts_info['counts'])
            output_fields.append(counts_info['assigned_counts'])
            # Add gene/event information to output fields
            gene_info = get_gene_info_from_params(params)
            output_fields.append(gene_info["chrom"])
            output_fields.append(gene_info["strand"])
            output_fields.append(gene_info["mRNA_starts"])
            output_fields.append(gene_info["mRNA_ends"])
            summary_file.write("%s\n" % ("\t".join(output_fields)))
            num_events += 1
        print(" - Summarized a total of %d events." % (num_events))