Exemple #1
0
def plot_density(pickle_filename, bam_filename):
    """
    Plot density along the alternative isoforms of a set of indexed genes.

    Arguments:

    - pickle_filename: indexed GFF file, loaded with
      gff_utils.load_indexed_gff_file
    - bam_filename: BAM filename; it is only echoed to the console here,
      the BAM file itself is not opened by this function

    The loaded genes are handed to plot_gene, plt.show() is called, and the
    loaded genes are finally dumped to the console.
    """
    print("Plotting density along alternative isoform")
    print("  - pickle: %s" % (pickle_filename))
    print("  - BAM: %s" % (bam_filename))

    indexed_genes = gff_utils.load_indexed_gff_file(pickle_filename)

    plot_gene(indexed_genes)
    plt.show()

    # Two items joined by a single space, exactly as the print statement
    # separates its items.
    print(" ".join(["gff_genes: ", str(indexed_genes)]))
Exemple #2
0
def main():
    """
    Command-line entry point: parse the options and run the requested tasks.

    Each task below is triggered by its own option and is checked with an
    independent `if`, so several tasks can run in a single invocation (in
    the order listed):

    - --pool-comparisons: pool comparison files (requires --event-type)
    - --compare-samples: compare two samples' MISO output directories
    - --run-two-iso-event: run on a single two-isoform event (requires
      --read-len and --overhang-len, incompatible with --use-cluster)
    - --compute-two-iso-psi: run on a set of two-isoform events, locally or
      on the cluster (requires --read-len and --overhang-len)
    - --compute-gene-psi: multi-isoform run on genes from an indexed GFF
      (requires --read-len; --overhang-len defaults to 1)
    - --summarize-samples / --summarize-multi-iso-samples: write a
      '<label>.miso_summary' file into a 'summary' subdirectory
    - --view-gene: print the genes, isoforms and exons of an indexed file

    The settings file (--settings-filename) is always loaded first.
    Validation failures print an error message and exit with status 1.
    """
    from optparse import OptionParser
    parser = OptionParser()
    ##
    ## Two isoform Psi
    ##
    parser.add_option("--compute-two-iso-psi", dest="two_iso_psi_files", nargs=2, default=None,
                      help="Compute Psi using MISO for a given set of two-isoform events. "
                      "Expects two arguments: the first is the set of events (in JSON/Pickle format), "
                      "the second is an output directory where estimated Psi values will "
                      "be outputted.")

    ##
    ## Multiple isoform Psi
    ##
    # parser.add_option("--compute-multi-iso-psi", dest="multi_iso_psi_files", nargs=3, default=None,
    #                   help="Compute Psi using for a given multi-isoform gene.  Expects three arguments: "
    #                   "the first is a file with the isoform lengths. The second is a file with the reads "
    #                   "aligned to the isoform. The third is an output directory.")
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi", nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene.  Expects four arguments: "
                      "the first is a gene ID or set of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end", nargs=2, default=None,
                      help="Run in paired-end mode.  Takes a mean and standard deviation "
                      "for the fragment length distribution (assumed to have discretized "
                      "normal form.)")

    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare", nargs=3, default=None,
                      help="Compute comparison statistics between the two given samples. "
                      "Expects three directories: the first is sample1's MISO output, "
                      "the second is sample2's MISO output, and the third is the directory where "
                      "results of the sample comparison will be outputted.")
    parser.add_option("--run-two-iso-event", dest="run_two_iso_event", nargs=3, default=None,
                      help="Run MISO on two isoform event, given an event name, an events file "
                      "(in JSON/Pickle format) and an output directory.")
    parser.add_option("--summarize-samples", dest="summarize_samples", nargs=2, default=None,
                      help="Compute summary statistics of the given set of samples. "
                      "Expects a directory with MISO output and a directory to output "
                      "summary file to.")
    # NOTE: shares its dest with --summarize-samples, so both options are
    # handled by the same summarizing code below.
    parser.add_option("--summarize-multi-iso-samples", dest="summarize_samples", nargs=2, default=None,
                      help="Compute summary statistics of the given set of samples from multi-isoform runs. "
                      "Expects a directory with MISO output and a directory to output summary file to.")
    parser.add_option("--pool-comparisons", dest="pool_comparisons", nargs=2, default=None,
                      help="Pool comparisons files into a single file. Expects a comparisons directory "
                      "generated by MISO and an output directory, and event type provided with --event-type.")
    parser.add_option("--use-cluster", action="store_true", dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs", default=False, type="int",
                      help="Size (in number of events) of each job to chunk events file into. "
                      "Only applies when running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_path, "settings", "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int", default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int", default=None)
    parser.add_option("--event-type", dest="event_type", default=None,
                      help="Event type of two-isoform events (e.g. 'SE', 'RI', 'A3SS', ...)")

    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene", nargs=1, default=None,
                      help="View the contents of a gene/event that has been indexed. "
                      "Takes as input an indexed (.pickle) filename.")
    (options, args) = parser.parse_args()

    ##
    ## Load the settings file
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    ##
    ## Pooling comparisons
    ##
    if options.pool_comparisons is not None:
        if options.event_type is None:
            print("Error: Must provide --event-type to pool comparisons")
            sys.exit(1)

        comparison_dir = os.path.abspath(os.path.expanduser(options.pool_comparisons[0]))
        output_dir = os.path.abspath(os.path.expanduser(options.pool_comparisons[1]))
        pool_comparisons(comparison_dir, options.event_type, output_dir)

    ##
    ## Comparing two samples
    ##
    if options.samples_to_compare:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print("Making comparisons directory: %s" % (output_dirname))
            os.makedirs(output_dirname)
        ht.output_samples_comparison(sample1_dirname, sample2_dirname,
                                     output_dirname)

    ##
    ## Single two-isoform event
    ##
    if options.run_two_iso_event:
        if options.read_len is None or options.overhang_len is None:
            print("Error: must provide --read-len and --overhang-len to run.")
            sys.exit(1)

        if options.use_cluster:
            print("Use cluster option not supported for running on a single event.")
            sys.exit(1)

        # convert paths to absolute path names
        event_name = options.run_two_iso_event[0]
        events_filename = os.path.abspath(options.run_two_iso_event[1])
        psi_outdir = os.path.abspath(os.path.expanduser(options.run_two_iso_event[2])) + '/'

        miso_events = as_events.MISOEvents(2, options.event_type,
                                           from_file=events_filename)

        run_two_iso_event(event_name, options.event_type, miso_events, psi_outdir,
                          options.read_len, options.overhang_len)

    # if options.inspect_events:
    #     print "Loading events from: %s" %(options.inspect_events)
    #     miso_events = as_events.MISOEvents(2, options.event_type, from_file=options.inspect_events)
    #     print "  - Total of %d events." %(len(miso_events.events))

    ##
    ## Set of two-isoform events
    ##
    if options.two_iso_psi_files:
        if options.read_len is None or options.overhang_len is None:
            print("Error: must provide --read-len and --overhang-len to run.")
            sys.exit(1)

        # convert paths to absolute path names
        events_filename = os.path.abspath(options.two_iso_psi_files[0])
        psi_outdir = os.path.abspath(options.two_iso_psi_files[1]) + '/'
        if options.use_cluster:
            run_two_iso_on_cluster(miso_path, events_filename, options.event_type, psi_outdir,
                                   options.read_len, options.overhang_len,
                                   chunk_jobs=options.chunk_jobs)
        else:
            if options.chunk_jobs:
                print("Error: Chunking jobs only applies when using the --use-cluster option "
                      "to run MISO on cluster.")
                sys.exit(1)

            compute_two_iso_psi(events_filename, options.event_type, psi_outdir,
                                options.read_len, options.overhang_len)

    ##
    ## Multiple isoforms interface based on SAM files
    ##
    if options.compute_gene_psi is not None:
        if options.read_len is None:
            print("Error: must provide --read-len.")
            sys.exit(1)

        paired_end = None

        if options.paired_end is not None:
            # (mean, standard deviation) of the fragment length distribution
            paired_end = float(options.paired_end[0]), \
                         float(options.paired_end[1])

        overhang_len = 1

        if options.overhang_len is not None:
            overhang_len = options.overhang_len

        # Genes to run on from GFF
        gene_ids = options.compute_gene_psi[0].split(",")

        # GFF filename describing genes
        gff_filename = os.path.abspath(os.path.expanduser(options.compute_gene_psi[1]))

        # BAM filename with reads
        bam_filename = os.path.abspath(os.path.expanduser(options.compute_gene_psi[2]))

        # Output directory
        output_dir = os.path.abspath(os.path.expanduser(options.compute_gene_psi[3]))

        compute_gene_psi(gene_ids, gff_filename, bam_filename, output_dir,
                         options.read_len, overhang_len, paired_end=paired_end,
                         event_type=options.event_type)

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        # The summary file is named after the last component of the samples
        # directory.
        samples_label = os.path.basename(os.path.expanduser(samples_dir))
        assert len(samples_label) >= 1
        summary_output_dir = os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                                          'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)

        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' % (samples_label))
        summarize_sampler_results(samples_dir, summary_filename)

    ##
    ## Viewing an indexed gene file
    ##
    if options.view_gene is not None:
        indexed_gene_filename = os.path.abspath(os.path.expanduser(options.view_gene))
        print("Viewing genes in %s" % (indexed_gene_filename))
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)

        if gff_genes is None:
            print("No genes.")
            return

        # The "%s %s" prints below reproduce the single space that the
        # print statement puts between its two items.
        for gene_id, gene_info in gff_genes.iteritems():
            print("Gene %s" % (gene_id))
            gene_obj = gene_info['gene_object']
            print("%s %s" % (" - Gene object: ", gene_obj))
            print("==")
            print("Isoforms: ")
            for isoform in gene_obj.isoforms:
                print("%s %s" % (" - ", isoform))
            print("==")
            print("Exons: ")
            for exon in gene_obj.parts:
                print("%s %s" % (" - ", exon))
Exemple #3
0
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename, output_dir,
                     read_len, overhang_len, paired_end=None, event_type=None):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: gene IDs (strings) corresponding to gene IDs from the GFF;
      genes of the indexed GFF that are not listed are skipped
    - gff_index_filename: indexed GFF filename describing the genes
    - bam_filename: BAM filename with the reads (must be sorted and indexed)
    - output_dir: output directory (created if it does not exist)
    - read_len, overhang_len: passed on to the read alignment step and to
      the sampler parameters
    - paired_end: optional; run in paired-end mode. Gives mean and standard
      deviation of fragment length distribution.
    - event_type: optional; when given, results are placed in an
      event-type subdirectory of the output directory

    For every requested gene with at least the configured minimum number of
    reads, the sampler is run and told to write to
    output_dir/[event_type/]chrom/gene_label.

    Returns None. If the GFF file does not exist, or no genes could be
    loaded from it, an error is printed and nothing is computed.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if not os.path.exists(gff_index_filename):
        print("Error: no such GFF file as %s" % (gff_index_filename))
        return

    num_genes = len(gene_ids)

    print("Computing Psi for %d genes..." % (num_genes))
    print("  - " + ", ".join(gene_ids))
    print("  - GFF filename: %s" % (gff_index_filename))
    print("  - BAM: %s" % (bam_filename))
    print("  - Outputting to: %s" % (output_dir))

    if paired_end:
        # "%s %s" reproduces the single space that the print statement
        # puts between its two items.
        print("%s %s" % ("  - Paired-end mode: ", paired_end))

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()

    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]

    min_event_reads = Settings.get_min_event_reads()

    if paired_end:
        # The mean and standard deviation are truncated to integers
        # before the variance is computed.
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # If given a template for the SAM file, use it. This is the same for
    # every gene, so look it up once.
    template = None

    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # The loader may hand back None instead of a gene table; bail out
    # rather than failing on None.iteritems().
    if gff_genes is None:
        print("Error: no genes could be loaded from %s" % (gff_index_filename))
        return

    # Constant-time membership tests while scanning the whole gene table
    requested_gene_ids = set(gene_ids)

    for gene_id, gene_info in gff_genes.iteritems():
        if gene_id not in requested_gene_ids:
            # Skip genes that we were not asked to run on
            continue

        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Find the most inclusive transcription start and end sites for each gene
        tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(gene_hierarchy[gene_id])

        # Load the BAM file
        # NOTE(review): the BAM file is re-loaded for every gene and never
        # explicitly closed here; confirm whether sam_utils allows sharing
        # one handle across genes before hoisting this out of the loop.
        bamfile = sam_utils.load_bam_reads(bam_filename, template=template)

        # Fetch reads aligning to the gene boundaries
        gene_reads = sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                                       tx_start, tx_end,
                                                       gene_obj)

        # Align the reads to the isoforms
        reads = sam_utils.sam_reads_to_isoforms(gene_reads, gene_obj, read_len,
                                                overhang_len,
                                                paired_end=paired_end)

        num_raw_reads = len(reads)

        # Skip gene if too few reads align to gene boundaries
        if num_raw_reads < min_event_reads:
            print("Only %d reads in gene, skipping (needed >= %d reads)"
                  % (num_raw_reads, min_event_reads))
            continue

        reads = array(reads)
        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = miso.get_paired_end_sampler_params(num_isoforms,
                                                                mean_frag_len,
                                                                frag_variance,
                                                                read_len,
                                                                overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params, paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params, paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)
        if not os.path.isdir(chrom_dir):
            os.makedirs(chrom_dir)

        output_filename = os.path.join(chrom_dir, gene_obj.label)

        sampler.run_sampler(num_iters, reads, gene_obj,
                            hyperparameters, sampler_params,
                            output_filename, burn_in=burn_in,
                            lag=lag)
Exemple #4
0
def compute_gene_psi(gene_ids,
                     gff_index_filename,
                     bam_filename,
                     output_dir,
                     read_len,
                     overhang_len,
                     paired_end=None,
                     event_type=None):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: gene IDs (strings) corresponding to gene IDs from the GFF;
      genes of the indexed GFF that are not listed are skipped
    - gff_index_filename: indexed GFF filename describing the genes
    - bam_filename: BAM filename with the reads (must be sorted and indexed)
    - output_dir: output directory (created if it does not exist)
    - read_len: passed on to the sampler parameters
    - overhang_len: passed on to the single-end sampler parameters only
    - paired_end: optional; run in paired-end mode. Gives mean and standard
      deviation of fragment length distribution.
    - event_type: optional; when given, results are placed in an
      event-type subdirectory of the output directory

    For every requested gene with at least the configured minimum number of
    reads, the sampler is run and told to write to
    output_dir/[event_type/]chrom/gene_label.

    Returns None. If the GFF file does not exist, or no genes could be
    loaded from it, an error is printed and nothing is computed.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Fail early with a readable message instead of an error from deep
    # inside the GFF loader.
    if not os.path.exists(gff_index_filename):
        print("Error: no such GFF file as %s" % (gff_index_filename))
        return

    num_genes = len(gene_ids)

    print("Computing Psi for %d genes..." % (num_genes))
    print("  - " + ", ".join(gene_ids))
    print("  - GFF filename: %s" % (gff_index_filename))
    print("  - BAM: %s" % (bam_filename))
    print("  - Outputting to: %s" % (output_dir))

    if paired_end:
        # "%s %s" reproduces the single space that the print statement
        # puts between its two items.
        print("%s %s" % ("  - Paired-end mode: ", paired_end))

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()

    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]

    min_event_reads = Settings.get_min_event_reads()

    if paired_end:
        # The mean and standard deviation are truncated to integers
        # before the variance is computed.
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # If given a template for the SAM file, use it. This is the same for
    # every gene, so look it up once.
    template = None

    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # The loader may hand back None instead of a gene table; bail out
    # rather than failing on None.iteritems().
    if gff_genes is None:
        print("Error: no genes could be loaded from %s" % (gff_index_filename))
        return

    # Constant-time membership tests while scanning the whole gene table
    requested_gene_ids = set(gene_ids)

    for gene_id, gene_info in gff_genes.iteritems():
        if gene_id not in requested_gene_ids:
            # Skip genes that we were not asked to run on
            continue

        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Find the most inclusive transcription start and end sites for each gene
        tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(
            gene_hierarchy[gene_id])

        # Load the BAM file
        # NOTE(review): the BAM file is re-loaded for every gene and never
        # explicitly closed here; confirm whether sam_utils allows sharing
        # one handle across genes before hoisting this out of the loop.
        bamfile = sam_utils.load_bam_reads(bam_filename, template=template)

        # Fetch reads aligning to the gene boundaries
        gene_reads = sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                                       tx_start, tx_end,
                                                       gene_obj)

        # Align the reads to the isoforms
        reads = sam_utils.sam_reads_to_isoforms(gene_reads,
                                                gene_obj,
                                                paired_end=paired_end)

        num_raw_reads = len(reads)

        # Skip gene if too few reads align to gene boundaries
        if num_raw_reads < min_event_reads:
            print("Only %d reads in gene, skipping (needed >= %d reads)"
                  % (num_raw_reads, min_event_reads))
            continue

        reads = array(reads)
        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = miso.get_paired_end_sampler_params(
                num_isoforms, mean_frag_len, frag_variance, read_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(
                num_isoforms, read_len, overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)
        if not os.path.isdir(chrom_dir):
            os.makedirs(chrom_dir)

        output_filename = os.path.join(chrom_dir, gene_obj.label)

        sampler.run_sampler(num_iters,
                            reads,
                            gene_obj,
                            hyperparameters,
                            sampler_params,
                            output_filename,
                            burn_in=burn_in,
                            lag=lag)