def plot_density(pickle_filename, bam_filename):
    """
    Plot the genes stored in an indexed GFF file and display the figure.

    Arguments:

    - pickle_filename: indexed GFF (pickle) file handed to
      gff_utils.load_indexed_gff_file
    - bam_filename: BAM filename; it is only echoed to the user here,
      the file itself is not opened by this function
    """
    print("Plotting density along alternative isoform")
    print("  - pickle: %s" % pickle_filename)
    print("  - BAM: %s" % bam_filename)

    indexed_genes = gff_utils.load_indexed_gff_file(pickle_filename)
    plot_gene(indexed_genes)
    plt.show()

    # Dump the loaded index once the figure has been shown.  The label and
    # the value are joined by a single space, exactly as a two-argument
    # print statement would emit them.
    print("%s %s" % ("gff_genes: ", indexed_genes))
def main():
    """
    Command-line entry point: parse the MISO options and run the requested tasks.

    Every task is guarded by its own independent ``if``, so several tasks
    given on one command line are executed one after another, in the order
    in which they appear below (pool comparisons, compare samples, run a
    single two-isoform event, compute two-isoform Psi, compute gene-level
    Psi, summarize samples, view gene).

    Invalid option combinations print an error message and terminate the
    process through sys.exit(1).  All path arguments have a leading "~"
    expanded and are converted to absolute paths before being passed on.
    """
    from optparse import OptionParser

    def resolve_path(path):
        # One normalisation rule for every path given on the command line:
        # expand a leading "~", then make the result absolute.
        return os.path.abspath(os.path.expanduser(path))

    parser = OptionParser()
    ##
    ## Two isoform Psi
    ##
    parser.add_option("--compute-two-iso-psi", dest="two_iso_psi_files", nargs=2, default=None,
                      help="Compute Psi using MISO for a given set of two-isoform events. "
                      "Expects two arguments: the first is the set of events (in JSON/Pickle format), "
                      "the second is an output directory where estimated Psi values will "
                      "be outputted.")
    ##
    ## Multiple isoform Psi
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi", nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene. Expects four arguments: "
                      "the first is a gene ID or set of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end", nargs=2, default=None,
                      help="Run in paired-end mode. Takes a mean and standard deviation "
                      "for the fragment length distribution (assumed to have discretized "
                      "normal form.)")
    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare", nargs=3, default=None,
                      help="Compute comparison statistics between the two given samples. "
                      "Expects three directories: the first is sample1's MISO output, "
                      "the second is sample2's MISO output, and the third is the directory where "
                      "results of the sample comparison will be outputted.")
    parser.add_option("--run-two-iso-event", dest="run_two_iso_event", nargs=3, default=None,
                      help="Run MISO on two isoform event, given an event name, an events file "
                      "(in JSON/Pickle format) and an output directory.")
    parser.add_option("--summarize-samples", dest="summarize_samples", nargs=2, default=None,
                      help="Compute summary statistics of the given set of samples. "
                      "Expects a directory with MISO output and a directory to output "
                      "summary file to.")
    # Shares its dest with --summarize-samples, so both flags trigger the
    # same summary code path below.
    parser.add_option("--summarize-multi-iso-samples", dest="summarize_samples", nargs=2, default=None,
                      help="Compute summary statistics of the given set of samples from multi-isoform runs. "
                      "Expects a directory with MISO output and a directory to output summary file to.")
    parser.add_option("--pool-comparisons", dest="pool_comparisons", nargs=2, default=None,
                      help="Pool comparisons files into a single file. Expects a comparisons directory "
                      "generated by MISO and an output directory, and event type provided with --event-type.")
    parser.add_option("--use-cluster", action="store_true", dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs", default=False, type="int",
                      help="Size (in number of events) of each job to chunk events file into. "
                      "Only applies when running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_path, "settings", "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int", default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int", default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
                      help="Event type of two-isoform events (e.g. 'SE', 'RI', 'A3SS', ...)")
    ##
    ## Gene utilities
    ##
    # With nargs=1 optparse stores a plain string (not a 1-tuple).
    parser.add_option("--view-gene", dest="view_gene", nargs=1, default=None,
                      help="View the contents of a gene/event that has been indexed. "
                      "Takes as input an indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    ##
    ## Load the settings file
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    ##
    ## Pool comparison files
    ##
    if options.pool_comparisons is not None:
        if options.event_type is None:
            print("Error: Must provide --event-type to pool comparisons")
            sys.exit(1)
        comparison_dir = resolve_path(options.pool_comparisons[0])
        output_dir = resolve_path(options.pool_comparisons[1])
        pool_comparisons(comparison_dir, options.event_type, output_dir)

    ##
    ## Compare two samples
    ##
    if options.samples_to_compare:
        sample1_dirname = resolve_path(options.samples_to_compare[0])
        sample2_dirname = resolve_path(options.samples_to_compare[1])
        output_dirname = resolve_path(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print("Making comparisons directory: %s" % output_dirname)
            os.makedirs(output_dirname)
        ht.output_samples_comparison(sample1_dirname, sample2_dirname, output_dirname)

    ##
    ## Run MISO on a single two-isoform event
    ##
    if options.run_two_iso_event:
        if options.read_len is None or options.overhang_len is None:
            print("Error: must provide --read-len and --overhang-len to run.")
            sys.exit(1)
        if options.use_cluster:
            print("Use cluster option not supported for running on a single event.")
            sys.exit(1)
        event_name = options.run_two_iso_event[0]
        events_filename = resolve_path(options.run_two_iso_event[1])
        # The output directory is handed on with a trailing slash.
        psi_outdir = resolve_path(options.run_two_iso_event[2]) + '/'
        miso_events = as_events.MISOEvents(2, options.event_type,
                                           from_file=events_filename)
        run_two_iso_event(event_name, options.event_type, miso_events, psi_outdir,
                          options.read_len, options.overhang_len)

    ##
    ## Compute Psi for a whole file of two-isoform events
    ##
    if options.two_iso_psi_files:
        if options.read_len is None or options.overhang_len is None:
            print("Error: must provide --read-len and --overhang-len to run.")
            sys.exit(1)
        events_filename = resolve_path(options.two_iso_psi_files[0])
        psi_outdir = resolve_path(options.two_iso_psi_files[1]) + '/'
        if options.use_cluster:
            run_two_iso_on_cluster(miso_path, events_filename, options.event_type,
                                   psi_outdir, options.read_len, options.overhang_len,
                                   chunk_jobs=options.chunk_jobs)
        else:
            if options.chunk_jobs:
                print("Error: Chunking jobs only applies when using the --use-cluster option "
                      "to run MISO on cluster.")
                sys.exit(1)
            compute_two_iso_psi(events_filename, options.event_type, psi_outdir,
                                options.read_len, options.overhang_len)

    ##
    ## Multiple isoforms interface based on SAM files
    ##
    if options.compute_gene_psi is not None:
        if options.read_len is None:
            print("Error: must provide --read-len.")
            sys.exit(1)
        paired_end = None
        if options.paired_end is not None:
            # (mean, standard deviation) of the fragment length distribution
            paired_end = (float(options.paired_end[0]),
                          float(options.paired_end[1]))
        # The overhang length falls back to 1 when --overhang-len is absent.
        overhang_len = 1
        if options.overhang_len is not None:
            overhang_len = options.overhang_len
        # Genes to run on from GFF
        gene_ids = options.compute_gene_psi[0].split(",")
        # GFF filename describing genes
        gff_filename = resolve_path(options.compute_gene_psi[1])
        # BAM filename with reads
        bam_filename = resolve_path(options.compute_gene_psi[2])
        # Output directory
        output_dir = resolve_path(options.compute_gene_psi[3])
        compute_gene_psi(gene_ids, gff_filename, bam_filename, output_dir,
                         options.read_len, overhang_len,
                         paired_end=paired_end,
                         event_type=options.event_type)

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = resolve_path(options.summarize_samples[0])
        # The summary file is named after the last component of the samples
        # directory.
        samples_label = os.path.basename(samples_dir)
        if not samples_label:
            # Happens when the samples directory is a filesystem root.
            print("Error: cannot derive a sample label from %s" % samples_dir)
            sys.exit(1)
        summary_output_dir = resolve_path(os.path.join(options.summarize_samples[1],
                                                       'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)
        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' % samples_label)
        summarize_sampler_results(samples_dir, summary_filename)

    ##
    ## View the contents of an indexed gene file
    ##
    if options.view_gene is not None:
        indexed_gene_filename = resolve_path(options.view_gene)
        print("Viewing genes in %s" % indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)
        if gff_genes is None:
            print("No genes.")
            return
        for gene_id, gene_info in gff_genes.iteritems():
            print("Gene %s" % gene_id)
            gene_obj = gene_info['gene_object']
            print(" - Gene object:  %s" % (gene_obj,))
            print("==")
            print("Isoforms: ")
            for isoform in gene_obj.isoforms:
                print(" -  %s" % (isoform,))
            print("==")
            print("Exons: ")
            for exon in gene_obj.parts:
                print(" -  %s" % (exon,))
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename, output_dir,
                     read_len, overhang_len, paired_end=None, event_type=None):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: gene IDs (strings) corresponding to gene IDs from the GFF
    - gff_index_filename: indexed GFF filename describing the genes
    - bam_filename: BAM filename with the reads (must be sorted and indexed)
    - output_dir: output directory; created if it does not exist
    - read_len: read length, passed to the read alignment step and to the
      sampler parameters
    - overhang_len: overhang length, passed to the same two places
    - paired_end: optional (mean, standard deviation) of the fragment
      length distribution; when given, the sampler runs in paired-end mode
    - event_type: optional event type; when given, the per-chromosome
      output is nested under a directory of that name

    Returns None.  Prints an error and returns early when the GFF index is
    missing or yields no genes.

    NOTE(review): compute_gene_psi is defined a second time later in this
    file; at import time the later definition replaces this one.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if not os.path.exists(gff_index_filename):
        print("Error: no such GFF file as %s" % gff_index_filename)
        return

    num_genes = len(gene_ids)

    print("Computing Psi for %d genes..." % num_genes)
    print("  - " + ", ".join(gene_ids))
    print("  - GFF filename: %s" % gff_index_filename)
    print("  - BAM: %s" % bam_filename)
    print("  - Outputting to: %s" % output_dir)

    if paired_end:
        print("  - Paired-end mode:  %s" % (paired_end,))

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()

    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]

    min_event_reads = Settings.get_min_event_reads()

    # Only meaningful (and only read) in paired-end mode.
    mean_frag_len = None
    frag_variance = None
    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
    if gff_genes is None:
        # The loader can hand back None (main() guards against this too).
        print("Error: no genes could be loaded from %s" % gff_index_filename)
        return

    # Constant-time membership test while scanning every gene in the index.
    requested_ids = set(gene_ids)

    # The SAM template does not depend on the gene, so resolve it once.
    template = None
    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    # Opened on the first requested gene and then reused, instead of
    # re-opening the BAM file for every gene.
    bamfile = None

    for gene_id, gene_info in gff_genes.iteritems():
        if gene_id not in requested_ids:
            # Skip genes that we were not asked to run on
            continue

        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Find the most inclusive transcription start and end sites for each gene
        tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(gene_hierarchy[gene_id])

        # Load the BAM file
        if bamfile is None:
            bamfile = sam_utils.load_bam_reads(bam_filename, template=template)

        # Fetch reads aligning to the gene boundaries
        gene_reads = sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                                       tx_start, tx_end,
                                                       gene_obj)

        # Align the reads to the isoforms
        reads = sam_utils.sam_reads_to_isoforms(gene_reads, gene_obj, read_len,
                                                overhang_len,
                                                paired_end=paired_end)

        num_raw_reads = len(reads)

        # Skip genes with fewer aligned reads than the configured minimum
        if num_raw_reads < min_event_reads:
            print("Only %d reads in gene, skipping (needed >= %d reads)"
                  % (num_raw_reads, min_event_reads))
            continue

        reads = array(reads)
        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = miso.get_paired_end_sampler_params(num_isoforms,
                                                                mean_frag_len,
                                                                frag_variance,
                                                                read_len,
                                                                overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params, paired_end=True,
                                       log_dir=output_dir)
        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params, paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)
        if not os.path.isdir(chrom_dir):
            os.makedirs(chrom_dir)

        output_filename = os.path.join(chrom_dir, gene_obj.label)

        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            burn_in=burn_in, lag=lag)
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename, output_dir,
                     read_len, overhang_len, paired_end=None, event_type=None):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: gene IDs (strings) corresponding to gene IDs from the GFF
    - gff_index_filename: indexed GFF filename describing the genes
    - bam_filename: BAM filename with the reads (must be sorted and indexed)
    - output_dir: output directory; created if it does not exist
    - read_len: read length, passed to the sampler parameters
    - overhang_len: overhang length, passed to the single-end sampler
      parameters
    - paired_end: optional (mean, standard deviation) of the fragment
      length distribution; when given, the sampler runs in paired-end mode
    - event_type: optional event type; when given, the per-chromosome
      output is nested under a directory of that name

    Returns None.  Prints an error and returns early when the GFF index is
    missing or yields no genes.

    NOTE(review): this is the second definition of compute_gene_psi in this
    file, so it is the one in effect after import.  Unlike the earlier
    definition it does not forward read_len/overhang_len to
    sam_utils.sam_reads_to_isoforms, nor overhang_len to
    miso.get_paired_end_sampler_params -- confirm which call signatures
    those helpers expect before unifying the two definitions.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Same guard as the other definition of this function in this file:
    # fail with a readable message instead of an error from the loader.
    if not os.path.exists(gff_index_filename):
        print("Error: no such GFF file as %s" % gff_index_filename)
        return

    num_genes = len(gene_ids)

    print("Computing Psi for %d genes..." % num_genes)
    print("  - " + ", ".join(gene_ids))
    print("  - GFF filename: %s" % gff_index_filename)
    print("  - BAM: %s" % bam_filename)
    print("  - Outputting to: %s" % output_dir)

    if paired_end:
        print("  - Paired-end mode:  %s" % (paired_end,))

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()

    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]

    min_event_reads = Settings.get_min_event_reads()

    # Only meaningful (and only read) in paired-end mode.
    mean_frag_len = None
    frag_variance = None
    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
    if gff_genes is None:
        # The loader can hand back None (main() guards against this too).
        print("Error: no genes could be loaded from %s" % gff_index_filename)
        return

    # Constant-time membership test while scanning every gene in the index.
    requested_ids = set(gene_ids)

    # The SAM template does not depend on the gene, so resolve it once.
    template = None
    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    # Opened on the first requested gene and then reused, instead of
    # re-opening the BAM file for every gene.
    bamfile = None

    for gene_id, gene_info in gff_genes.iteritems():
        if gene_id not in requested_ids:
            # Skip genes that we were not asked to run on
            continue

        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Find the most inclusive transcription start and end sites for each gene
        tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(gene_hierarchy[gene_id])

        # Load the BAM file
        if bamfile is None:
            bamfile = sam_utils.load_bam_reads(bam_filename, template=template)

        # Fetch reads aligning to the gene boundaries
        gene_reads = sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                                       tx_start, tx_end,
                                                       gene_obj)

        # Align the reads to the isoforms
        reads = sam_utils.sam_reads_to_isoforms(gene_reads, gene_obj,
                                                paired_end=paired_end)

        num_raw_reads = len(reads)

        # Skip genes with fewer aligned reads than the configured minimum
        if num_raw_reads < min_event_reads:
            print("Only %d reads in gene, skipping (needed >= %d reads)"
                  % (num_raw_reads, min_event_reads))
            continue

        reads = array(reads)
        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = miso.get_paired_end_sampler_params(num_isoforms,
                                                                mean_frag_len,
                                                                frag_variance,
                                                                read_len)
            sampler = miso.MISOSampler(sampler_params, paired_end=True,
                                       log_dir=output_dir)
        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params, paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)
        if not os.path.isdir(chrom_dir):
            os.makedirs(chrom_dir)

        output_filename = os.path.join(chrom_dir, gene_obj.label)

        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            burn_in=burn_in, lag=lag)