Example #1
0
def count_total_reads(bam_filename):
    """
    Count the reads in a BAM file.

    The file is opened with sam_utils.load_bam_reads and every item
    produced by iterating over the returned object is counted; no
    filtering is applied here.

    Returns the count as an integer (0 if nothing is yielded).
    """
    reads = sam_utils.load_bam_reads(bam_filename)
    return sum(1 for _ in reads)
Example #2
0
def count_total_reads(bam_filename):
    """
    Return the number of reads found in the given BAM file.

    Every item yielded by iterating over the object returned from
    sam_utils.load_bam_reads is counted once; nothing is filtered out
    in this function.
    """
    return sum(1 for _read in sam_utils.load_bam_reads(bam_filename))
Example #3
0
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: collection of gene IDs (as they appear in the GFF) to
      run on; genes in the index that are not listed are skipped
    - gff_index_filename: indexed GFF filename describing the genes;
      its basename must end in ".pickle"
    - bam_filename: BAM filename with the reads (must be sorted and
      indexed)
    - output_dir: output directory
    - read_len: read length; genes whose isoforms are all shorter than
      this are skipped
    - overhang_len: overhang length handed to the sampler parameters
    - paired_end: optional. Run in paired-end mode. Gives mean and
      standard deviation of fragment length distribution, as a
      two-item sequence (mean, standard deviation)
    - event_type: optional. When given, results are written under
      output_dir/event_type/<chrom> instead of output_dir/<chrom>
    - verbose: currently unused; kept for interface compatibility

    Returns None. Prints an error and returns early if the GFF index
    file does not exist. Exits the process with status 1 if the index
    filename does not end in ".pickle".
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        # paired_end[1] is a standard deviation; square it to get the
        # variance
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # Optional settings: a template for the SAM file and whether to
    # filter genes by read count.  Both lookups are guarded so that
    # missing/empty settings fall back to the defaults instead of
    # raising on the membership test.
    template = None
    filter_reads = True

    if settings:
        if "sam_template" in settings:
            template = settings["sam_template"]
        if "filter_reads" in settings:
            filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    # NOTE(review): the result is not used anywhere in this function
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)

    index_suffix = ".pickle"

    for gene_id, gene_info in gff_genes.iteritems():
        # Skip genes that we were not asked to run on
        if gene_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the gene
        if all(iso_len < read_len for iso_len in gene_obj.iso_lens):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand,
                                      given_read_len=read_len)
        # Skip gene if too few reads align to the gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads,
                        min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # The sampler parameters depend on whether this is a paired-end
        # or single-end data set; the sampler itself is built the same
        # way in both cases.
        if paired_end:
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
        else:
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
        sampler = miso.MISOSampler(sampler_params,
                                   paired_end=bool(paired_end),
                                   log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            # An already-existing directory is fine; anything else
            # (e.g. a permissions problem) should not be hidden
            if not os.path.isdir(chrom_dir):
                raise

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(index_suffix):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        # Strip only the trailing suffix, not every ".pickle" in the name
        miso_basename = miso_basename[:-len(index_suffix)]
        output_filename = os.path.join(chrom_dir, miso_basename)
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
Example #4
0
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: collection of gene IDs (as they appear in the GFF) to
      run on; genes in the index that are not listed are skipped
    - gff_index_filename: indexed GFF filename describing the genes;
      its basename must end in ".pickle"
    - bam_filename: BAM filename with the reads (must be sorted and
      indexed)
    - output_dir: output directory
    - read_len: read length; genes whose isoforms are all shorter than
      this are skipped
    - overhang_len: overhang length handed to the sampler parameters
    - paired_end: optional. Run in paired-end mode. Gives mean and
      standard deviation of fragment length distribution, as a
      two-item sequence (mean, standard deviation)
    - event_type: optional. When given, results are written under
      output_dir/event_type/<chrom> instead of output_dir/<chrom>
    - verbose: currently unused; kept for interface compatibility

    Returns None. Prints an error and returns early if the GFF index
    file does not exist. Exits the process with status 1 if the index
    filename does not end in ".pickle".
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        # paired_end[1] is a standard deviation; square it to get the
        # variance
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # Optional settings: a template for the SAM file and whether to
    # filter genes by read count.  Both lookups are guarded so that
    # missing/empty settings fall back to the defaults instead of
    # raising on the membership test.
    template = None
    filter_reads = True

    if settings:
        if "sam_template" in settings:
            template = settings["sam_template"]
        if "filter_reads" in settings:
            filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    # NOTE(review): the result is not used anywhere in this function
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)

    index_suffix = ".pickle"

    for gene_id, gene_info in gff_genes.iteritems():
        # Skip genes that we were not asked to run on
        if gene_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the gene
        if all(iso_len < read_len for iso_len in gene_obj.iso_lens):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand)
        # Skip gene if too few reads align to the gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads,
                        min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # The sampler parameters depend on whether this is a paired-end
        # or single-end data set; the sampler itself is built the same
        # way in both cases.
        if paired_end:
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
        else:
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
        sampler = miso.MISOSampler(sampler_params,
                                   paired_end=bool(paired_end),
                                   log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            # An already-existing directory is fine; anything else
            # (e.g. a permissions problem) should not be hidden
            if not os.path.isdir(chrom_dir):
                raise

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(index_suffix):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        # Strip only the trailing suffix, not every ".pickle" in the name
        miso_basename = miso_basename[:-len(index_suffix)]
        output_filename = os.path.join(chrom_dir, miso_basename)
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
Example #5
0
def compute_rpkm(gff_filename, bam_filename, read_len, output_dir):
    """
    Compute RPKMs for genes listed in GFF based on BAM reads.
    """
    print "Computing RPKMs..."
    print "  - GFF filename: %s" % (gff_filename)
    print "  - BAM filename: %s" % (bam_filename)
    print "  - Output dir: %s" % (output_dir)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    output_filename = os.path.join(
        output_dir, "%s.rpkm" % (os.path.basename(bam_filename)))
    print "Outputting RPKMs to: %s" % (output_filename)

    rpkm_fieldnames = ['gene_id', 'rpkm', 'const_exon_lens', 'num_reads']

    # Parse the GFF into genes
    print "Parsing GFF into genes..."
    t1 = time.time()
    gff_genes = load_genes_from_gff(gff_filename)
    t2 = time.time()
    print "Parsing took %.2f seconds" % (t2 - t1)

    # Load the BAM file
    bamfile = sam_utils.load_bam_reads(bam_filename)

    print "Counting all reads..."
    t1 = time.time()
    num_total_reads = count_total_reads(bam_filename)
    t2 = time.time()
    print "Took: %.2f seconds" % (t2 - t1)

    print "Number of total reads in BAM file: %d" % (num_total_reads)

    num_genes = 0

    rpkms_dictlist = []

    exons_too_small = {}
    num_no_const = 0

    for gene_id, gene_info in gff_genes.iteritems():
        # Get the gene object
        gene = gene_info['gene_object']

        # Get constitutive exons
        const_exons = gene.get_const_parts()

        num_reads = []
        exon_lens = []

        regions_counted = {}

        if not gene.chrom.startswith("chr"):
            chrom = "chr%s" % (gene.chrom)
        else:
            chrom = gene.chrom

        if "random" in chrom:
            print "Skipping random chromosome gene: %s, %s" \
                  %(gene_id, chrom)
            continue

        if len(const_exons) == 0:
            print "Gene %s has no constitutive regions!" % (gene_id)
            num_no_const += 1
            continue

        total_num_reads = 0

        for exon in const_exons:
            exon_len = exon.end - exon.start + 1

            counts = 0

            try:
                reads = bamfile.fetch(chrom, exon.start, exon.end)
            except ValueError:
                print "Error fetching region: %s:%d-%d" % (chrom, exon.start,
                                                           exon.end)
                break

            # Count reads landing in exon
            for r in reads:
                counts += 1

            total_num_reads += counts

            # Skip exons that we've seen already or exons that are shorter
            # than the read length
            if (exon.start, exon.end) in regions_counted or \
               exon_len < read_len:
                continue

            exon_lens.append(exon_len)

            num_reads.append(counts)

            regions_counted[(exon.start, exon.end)] = True

        if len(regions_counted) == 0:
            #            print "Gene %s exons are too small for %d-long reads" \
            #                  %(gene_id, read_len)
            exons_too_small[gene_id] = total_num_reads
            continue


#        print "Used total of %d regions" %(len(regions_counted))
#        print "Total of %d regions are too small" %(num_too_small)

        rpkm = rpkm_per_region(exon_lens, num_reads, read_len, num_total_reads)

        #        print rpkm, exon_lens, num_reads, read_len

        # Convert region lengths and number of reads to strings
        exon_lens_str = ",".join([str(e) for e in exon_lens])
        num_reads_str = ",".join([str(n) for n in num_reads])

        rpkm_entry = {
            'gene_id': gene_id,
            'rpkm': "%.2f" % (rpkm),
            'const_exon_lens': exon_lens_str,
            'num_reads': num_reads_str
        }
        rpkms_dictlist.append(rpkm_entry)

        #        print "RPKM: %.2f" %(rpkm)

        # Compute how many reads land in each constitutive exon
        num_genes += 1

    num_too_small = len(exons_too_small.keys())

    print "Computed RPKMs for %d genes." % (num_genes)
    print "  - Total of %d genes cannot be used because they lack const. regions." \
          %(num_no_const)
    print "  - Total of %d genes cannot be used since their exons are too small." \
          %(num_too_small)
    for gene, total_counts in exons_too_small.iteritems():
        print "      gene_id\ttotal_counts"
        print "    * %s\t%d" % (gene, total_counts)

    # Output RPKMs to file
    dictlist2file(rpkms_dictlist, output_filename, rpkm_fieldnames)

    return rpkms_dictlist
Example #6
0
def compute_rpkm(gff_filename, bam_filename, read_len,
                 output_dir):
    """
    Compute RPKMs for genes listed in GFF based on BAM reads.
    """
    print "Computing RPKMs..."
    print "  - GFF filename: %s" %(gff_filename)
    print "  - BAM filename: %s" %(bam_filename)
    print "  - Output dir: %s" %(output_dir)
    
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    output_filename = os.path.join(output_dir,
                                   "%s.rpkm" %(os.path.basename(bam_filename)))
    print "Outputting RPKMs to: %s" %(output_filename)

    rpkm_fieldnames = ['gene_id', 'rpkm', 'const_exon_lens',
                       'num_reads']

    # Parse the GFF into genes
    print "Parsing GFF into genes..."
    t1 = time.time()
    gff_genes = load_genes_from_gff(gff_filename)
    t2 = time.time()
    print "Parsing took %.2f seconds" %(t2 - t1)

    # Load the BAM file
    bamfile = sam_utils.load_bam_reads(bam_filename)

    print "Counting all reads..."
    t1 = time.time()
    num_total_reads = count_total_reads(bam_filename)
    t2 = time.time()
    print "Took: %.2f seconds" %(t2 - t1)

    print "Number of total reads in BAM file: %d" %(num_total_reads)

    num_genes = 0

    rpkms_dictlist = []

    exons_too_small = {}
    num_no_const = 0

    for gene_id, gene_info in gff_genes.iteritems():
        # Get the gene object
        gene = gene_info['gene_object']

        # Get constitutive exons
        const_exons = gene.get_const_parts()

        num_reads = []
        exon_lens = []

        regions_counted = {}

        if not gene.chrom.startswith("chr"):
            chrom = "chr%s" %(gene.chrom)
        else:
            chrom = gene.chrom

        if "random" in chrom:
            print "Skipping random chromosome gene: %s, %s" \
                  %(gene_id, chrom)
            continue

        if len(const_exons) == 0:
            print "Gene %s has no constitutive regions!" %(gene_id)
            num_no_const += 1
            continue

        total_num_reads = 0

        for exon in const_exons:
            exon_len = exon.end - exon.start + 1

            counts = 0

            try:
                reads = bamfile.fetch(chrom, exon.start, exon.end)
            except ValueError:
                print "Error fetching region: %s:%d-%d" %(chrom,
                                                          exon.start,
                                                          exon.end)
                break
            
            # Count reads landing in exon
            for r in reads: counts += 1

            total_num_reads += counts

            # Skip exons that we've seen already or exons that are shorter
            # than the read length
            if (exon.start, exon.end) in regions_counted or \
               exon_len < read_len:
                continue
            
            exon_lens.append(exon_len)
            
            num_reads.append(counts)

            regions_counted[(exon.start, exon.end)] = True

        if len(regions_counted) == 0:
#            print "Gene %s exons are too small for %d-long reads" \
#                  %(gene_id, read_len)
            exons_too_small[gene_id] = total_num_reads
            continue

#        print "Used total of %d regions" %(len(regions_counted))
#        print "Total of %d regions are too small" %(num_too_small)

        rpkm = rpkm_per_region(exon_lens, num_reads, read_len,
                               num_total_reads)
        
#        print rpkm, exon_lens, num_reads, read_len      

        # Convert region lengths and number of reads to strings
        exon_lens_str = ",".join([str(e) for e in exon_lens])
        num_reads_str = ",".join([str(n) for n in num_reads])

        rpkm_entry = {'gene_id': gene_id,
                      'rpkm': "%.2f" %(rpkm),
                      'const_exon_lens': exon_lens_str,
                      'num_reads': num_reads_str}
        rpkms_dictlist.append(rpkm_entry)
        
#        print "RPKM: %.2f" %(rpkm)
            
        # Compute how many reads land in each constitutive exon
        num_genes += 1

    num_too_small = len(exons_too_small.keys())

    print "Computed RPKMs for %d genes." %(num_genes)
    print "  - Total of %d genes cannot be used because they lack const. regions." \
          %(num_no_const)
    print "  - Total of %d genes cannot be used since their exons are too small." \
          %(num_too_small)
    for gene, total_counts in exons_too_small.iteritems():
        print "      gene_id\ttotal_counts"
        print "    * %s\t%d" %(gene, total_counts)

    # Output RPKMs to file
    dictlist2file(rpkms_dictlist, output_filename,
                  rpkm_fieldnames)

    return rpkms_dictlist