def get_ids_passing_filter(gff_index_dir,
                           bam_filename,
                           output_dir):
    """
    Apply filter to events using bedtools and return
    only the events that meet the filter.
    """
    # Minimum reads per event, taken from the loaded MISO settings
    min_event_reads = Settings.get_min_event_reads()
    
    # Check that the GFF was indexed with a version that outputs
    # a genes.gff file
    genes_gff_fname = os.path.join(gff_index_dir,
                                   "genes.gff")
    if not os.path.isfile(genes_gff_fname):
        print "WARNING: Could not find \'genes.gff\' in %s - " \
              "skipping prefilter stage. Please reindex your " \
              "GFF file with the latest version to enable " \
              "prefiltering." %(gff_index_dir)
        return None
    print "Prefiltering reads..."
    coverage_fname = exon_utils.get_bam_gff_coverage(bam_filename,
                                                     genes_gff_fname,
                                                     output_dir)
    ids_passing_filter = []
    with open(coverage_fname) as coverage_in:
        for line in coverage_in:
            # Skip comments
            if line.startswith("#"):
                continue
            fields = line.strip().split("\t")
            # bedtools coverage appends the read count after the GFF's
            # nine columns, so field 10 (index 9) holds the counts
            counts = int(fields[9])
            if counts < min_event_reads:
                continue
            attribs = gff_utils.parse_gff_attribs(fields[8])
            if "ID" not in attribs:
                print "WARNING: No ID= found for line:\n%s\nSkipping..." \
                    %(line)
                continue
            event_id = attribs["ID"]
            ids_passing_filter.append(event_id)
    return ids_passing_filter
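
A minimal usage sketch, assuming hypothetical paths and that a MISO settings file has already been loaded via Settings.load() so that Settings.get_min_event_reads() has a value to return:

    Settings.load("miso_settings.txt")   # hypothetical settings file
    event_ids = get_ids_passing_filter("indexed_gff/",
                                       "sample.sorted.bam",
                                       "filter_output/")
    if event_ids is not None:
        print "%d events passed the prefilter" %(len(event_ids))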
Example #3
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: Run in paired-end mode. Gives mean and standard deviation
      of fragment length distribution.
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print "Error: GFF index %s not found" %(gff_index_filename)
        return

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # If given a template for the SAM file, use it
    template = None

    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    if "filter_reads" not in settings:
        filter_reads = True
    else:
        filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)

    for gene_id, gene_info in gff_genes.iteritems():
        lookup_id = gene_id
        # Skip genes that we were not asked to run on
        if lookup_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(map(lambda l: l < read_len, gene_obj.iso_lens)):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand,
                                      given_read_len=read_len)
        # Skip gene if none of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads,
                        min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            pass

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        miso_basename = miso_basename.replace(".pickle", "")
        output_filename = os.path.join(chrom_dir, miso_basename)
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
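
A hedged invocation sketch: the gene ID, file paths, and read parameters are placeholders, the index filename must end in ".pickle" (the function exits otherwise), and paired_end, when given, is a (mean, standard deviation) pair:

    Settings.load("miso_settings.txt")   # sampler params come from settings
    compute_gene_psi(["ENSG00000100320"],                     # placeholder gene ID
                     "indexed/chr22/ENSG00000100320.pickle",  # indexed GFF entry
                     "sample.sorted.bam",
                     "miso_output/",
                     read_len=36,
                     overhang_len=4,
                     paired_end=[250, 15])   # mean=250, sdev=15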
Example #4
def __init__(self, gff_dir, bam_filename,
             output_dir, read_len, overhang_len,
             main_logger,
             settings_fname=None,
             paired_end=None,
             use_cluster=False,
             chunk_jobs=200,
             SGEarray=False,
             sge_job_name="misojob",
             gene_ids=None,
             num_proc=None,
             wait_on_jobs=True):
    self.main_logger = main_logger
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename
    # Check that the BAM file exists and that it has an index
    if not os.path.isfile(self.bam_filename):
        self.main_logger.error("BAM file %s not found." %(self.bam_filename))
        sys.exit(1)
    self.bam_index_fname = "%s.bai" %(self.bam_filename)
    if not os.path.isfile(self.bam_index_fname):
        self.main_logger.warning("Expected BAM index file %s not found."
                                 %(self.bam_index_fname))
        self.main_logger.warning("Are you sure your BAM file is indexed?")
    self.output_dir = output_dir
    self.read_len = read_len
    # For now the overhang length is always set to 1
    #self.overhang_len = overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    self.chunk_jobs = chunk_jobs
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    # If chunk_jobs was not given (i.e. set to False),
    # default it to an arbitrary value
    if not self.chunk_jobs:
        self.chunk_jobs = 200
    self.SGEarray = SGEarray
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        num_proc = int(num_proc)
        self.num_processors = num_proc
        self.main_logger.info("Using %d processors" %(num_proc))
    self.long_thresh = 50
    self.batch_logs_dir = \
        os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = \
        os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = \
        os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = \
        os.path.join(output_dir, "scripts_output")
    misc_utils.make_dir(self.batch_logs_dir)
    misc_utils.make_dir(self.batch_genes_dir)
    misc_utils.make_dir(self.cluster_scripts_dir)
    misc_utils.make_dir(self.scripts_output_dir)
    # First compile the set of genes to run on and write them to
    # file along with their indexed filenames
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    # If we're given filtered gene IDs, use them
    if gene_ids is not None:
        self.gene_ids = gene_ids
    else:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    if len(self.gene_ids) == 0:
        self.main_logger.error("No genes to run on. Did you pass me the "
                               "wrong path to your indexed GFF directory? "
                               "Or perhaps your indexed GFF directory "
                               "is empty?")
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
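
The enclosing class is not shown in this snippet, so GenesDispatcher below is only a stand-in name and the paths are hypothetical; main_logger can be any object exposing info/warning/error methods, such as a standard-library logger.

    import logging
    logging.basicConfig(level=logging.INFO)
    dispatcher = GenesDispatcher("indexed_gff/",        # stand-in class name,
                                 "sample.sorted.bam",   # hypothetical paths
                                 "miso_output/",
                                 read_len=36,
                                 overhang_len=4,
                                 main_logger=logging.getLogger("miso"))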
Example #5
def compute_psi(sample_filenames,
                output_dir,
                event_type,
                read_len,
                overhang_len,
                use_cluster=False,
                chunk_jobs=False,
                filter_events=True,
                events_info_filename=None,
                settings_filename=None):
    """
    Compute Psi values for skipped exons.  Sample filenames is a mapping from
    sample label to sample.

      - sample_filenames = [[sample_label1, sample_filename1],
                            [sample_label2, sample_filename2]]
      - output_dir: output directory
      - event_type: 'SE', 'RI', etc.
    """
    misc_utils.make_dir(output_dir)

    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)

    misc_utils.make_dir(output_dir)

    print "Computing Psi for events of type %s" % (event_type)
    print "  - samples used: ", sample_filenames.keys()

    for sample_label, sample_filename in sample_filenames.iteritems():
        print "Processing sample: label=%s, filename=%s" \
                   %(sample_label, sample_filename)
        results_output_dir = os.path.join(output_dir, sample_label)
        misc_utils.make_dir(results_output_dir)

        # Load the set of counts and serialize them into JSON
        events = \
                   as_events.load_event_counts(sample_filename,
                                               event_type,
                                               events_info_filename=events_info_filename)

        # Filter events
        if filter_events:
            print "Filtering events..."
            events.filter_events(settings=Settings.get())

        print "Running on a total of %d events." % (len(events.events))

        events_filename = events.output_file(results_output_dir, sample_label)

        # Run MISO on them
        miso_cmd = "python %s --compute-two-iso-psi %s %s --event-type %s " \
                          "--read-len %d --overhang-len %d " \
                          %(os.path.join(miso_path, 'run_miso.py'),
                            events_filename,
                            results_output_dir,
                            event_type,
                            read_len,
                            overhang_len)
        if use_cluster:
            if chunk_jobs:
                miso_cmd += ' --use-cluster --chunk-jobs %d' % (chunk_jobs)
            else:
                miso_cmd += ' --use-cluster'
        print "Executing: %s" % (miso_cmd)
        if use_cluster:
            print " - Using cluster"
        os.system(miso_cmd)
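
Since the body iterates with sample_filenames.iteritems(), the first argument must be a dict of sample labels to counts filenames. A hedged call sketch with placeholder filenames (it assumes the module-level miso_path already points at the directory containing run_miso.py):

    compute_psi({"control": "control_counts.txt",
                 "knockdown": "kd_counts.txt"},
                "psi_output/",
                "SE",
                read_len=36,
                overhang_len=4)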
Example #6
def __init__(self, gff_dir, bam_filename,
             output_dir, read_len, overhang_len,
             settings_fname=None,
             paired_end=None,
             use_cluster=False,
             chunk_jobs=200,
             SGEarray=False,
             sge_job_name="misojob",
             gene_ids=None,
             num_proc=None,
             wait_on_jobs=True):
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename
    # Check that the BAM file exists and that it has an index
    if not os.path.isfile(self.bam_filename):
        print "Error: BAM file %s not found." %(self.bam_filename)
        sys.exit(1)
    self.bam_index_fname = "%s.bai" %(self.bam_filename)
    if not os.path.isfile(self.bam_index_fname):
        print "WARNING: Expected BAM index file %s not found." \
            %(self.bam_index_fname)
        print "Are you sure your BAM file is indexed?"
    self.output_dir = output_dir
    self.read_len = read_len
    # For now the overhang length is always set to 1
    #self.overhang_len = overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    self.chunk_jobs = chunk_jobs
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    # If chunk_jobs was not given (i.e. set to False),
    # default it to an arbitrary value
    if not self.chunk_jobs:
        self.chunk_jobs = 200
    self.SGEarray = SGEarray
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        num_proc = int(num_proc)
        self.num_processors = num_proc
        print "Using %d processors" %(num_proc)
    self.long_thresh = 50
    self.batch_logs_dir = \
        os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = \
        os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = \
        os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = \
        os.path.join(output_dir, "scripts_output")
    misc_utils.make_dir(self.batch_logs_dir)
    misc_utils.make_dir(self.batch_genes_dir)
    misc_utils.make_dir(self.cluster_scripts_dir)
    misc_utils.make_dir(self.scripts_output_dir)
    # First compile the set of genes to run on and write them to
    # file along with their indexed filenames
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    # If we're given filtered gene IDs, use them
    if gene_ids is not None:
        self.gene_ids = gene_ids
    else:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    if len(self.gene_ids) == 0:
        print "Error: No genes to run on. Did you pass me the wrong path " \
              "to your indexed GFF directory? " \
              "Or perhaps your indexed GFF directory " \
              "is empty?"
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()

def compute_psi(sample_filenames, output_dir, event_type,
                read_len, overhang_len,
                use_cluster=False,
                chunk_jobs=False,
                filter_events=True,
                events_info_filename=None,
                settings_filename=None):
    """
    Compute Psi values for skipped exons. sample_filenames is a mapping
    from sample label to sample counts filename:

      - sample_filenames = {sample_label1: sample_filename1,
                            sample_label2: sample_filename2}
      - output_dir: output directory
      - event_type: 'SE', 'RI', etc.
    """
    misc_utils.make_dir(output_dir)

    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)

    misc_utils.make_dir(output_dir)

    print "Computing Psi for events of type %s" %(event_type)
    print "  - samples used: ", sample_filenames.keys()

    for sample_label, sample_filename in sample_filenames.iteritems():
        print "Processing sample: label=%s, filename=%s" \
            %(sample_label, sample_filename)
        results_output_dir = os.path.join(output_dir, sample_label)
        misc_utils.make_dir(results_output_dir)

        # Load the set of counts and serialize them into JSON
        events = \
            as_events.load_event_counts(sample_filename,
                                        event_type,
                                        events_info_filename=events_info_filename)

        # Filter events
        if filter_events:
            print "Filtering events..."
            events.filter_events(settings=Settings.get())

        print "Running on a total of %d events." %(len(events.events))

        events_filename = events.output_file(results_output_dir,
                                             sample_label)

        # Run MISO on them
        miso_cmd = "python %s --compute-two-iso-psi %s %s --event-type %s " \
                   "--read-len %d --overhang-len %d " \
                   %(os.path.join(miso_path, 'run_miso.py'),
                     events_filename,
                     results_output_dir,
                     event_type,
                     read_len,
                     overhang_len)
        if use_cluster:
            if chunk_jobs:
                miso_cmd += ' --use-cluster --chunk-jobs %d' %(chunk_jobs)
            else:
                miso_cmd += ' --use-cluster'
        print "Executing: %s" %(miso_cmd)
        if use_cluster:
            print " - Using cluster"
        os.system(miso_cmd)
Example #8
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: Run in paired-end mode. Gives mean and standard deviation
      of fragment length distribution.
    """
    misc_utils.make_dir(output_dir)
        
    if not os.path.exists(gff_index_filename):
        print "Error: GFF index %s not found" %(gff_index_filename)
        return
    
    num_genes = len(gene_ids)
    
    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
    
    # If given a template for the SAM file, use it
    template = None

    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    if "filter_reads" not in settings:
        filter_reads = True
    else:
        filter_reads = settings["filter_reads"]
        
    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)
    
    for gene_id, gene_info in gff_genes.iteritems():
        lookup_id = gene_id
        # Skip genes that we were not asked to run on
        if lookup_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(map(lambda l: l < read_len, gene_obj.iso_lens)):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue
        
        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand)
        # Skip gene if none of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads,
                        min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            pass

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        miso_basename = miso_basename.replace(".pickle", "")
        output_filename = os.path.join(chrom_dir, miso_basename)
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
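
The prefilter and the gene-level sampler compose naturally. A hedged end-to-end sketch with hypothetical paths; it assumes an event-centric index, so that the IDs returned by the prefilter appear as keys of the gene-to-index mapping:

    gene_index = gff_utils.get_gene_ids_to_gff_index("indexed_gff/")
    passing = get_ids_passing_filter("indexed_gff/",
                                     "sample.sorted.bam",
                                     "filter_output/")
    for event_id in (passing or []):
        index_fname = gene_index.get(event_id)
        if index_fname is None:
            continue
        compute_gene_psi([event_id], index_fname,
                         "sample.sorted.bam", "miso_output/",
                         read_len=36, overhang_len=4)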