Exemple #1
0
def count_total_reads(bam_filename):
    """
    Return total number of proper reads in BAM file.
    """
    # Iterate once over every read returned by the BAM loader and count.
    bam_reads = sam_utils.load_bam_reads(bam_filename)
    return sum(1 for _ in bam_reads)
Exemple #2
0
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename, output_dir,
                     read_len, overhang_len, paired_end=None, event_type=None):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: set of gene IDs corresponding to gene IDs from the GFF
    - gff_index_filename: indexed GFF filename describing the genes
    - bam_filename: BAM filename with the reads (must be sorted and indexed)
    - output_dir: output directory
    - read_len: read length of the data
    - overhang_len: minimum overhang length for junction reads
    - paired_end: optional; run in paired-end mode. Gives mean and standard
      deviation of fragment length distribution.
    - event_type: optional event type; results for a gene are placed in a
      subdirectory of output_dir named after it

    Writes one sampler output file per requested gene to
    output_dir[/event_type]/<chrom>/<gene label>. Returns early (with a
    printed error) if the indexed GFF file does not exist.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Fail early with a message rather than crashing in the GFF loader.
    if not os.path.exists(gff_index_filename):
        print "Error: no such GFF file as %s" %(gff_index_filename)
        return

    num_genes = len(gene_ids)
    
    print "Computing Psi for %d genes..." %(num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" %(gff_index_filename)
    print "  - BAM: %s" %(bam_filename)
    print "  - Outputting to: %s" %(output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    # Global MISO settings and MCMC sampler parameters
    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]

    # Minimum number of aligned reads needed to run the sampler on a gene
    min_event_reads = Settings.get_min_event_reads()

    if paired_end:
        # paired_end is (mean, sdev) of the fragment length distribution;
        # the sampler expects the variance, i.e. sdev squared
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)


    # Load the genes from the GFF
#    print "Loading genes from indexed GFF..."
#    t1 = time.time()
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
#    t2 = time.time()
#    print "  - Loading took: %.2f seconds" %(t2 - t1)
        
    for gene_id, gene_info in gff_genes.iteritems():
        if gene_id not in gene_ids:
            # Skip genes that we were not asked to run on
            continue

        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Find the most inclusive transcription start and end sites for each gene
        tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # If given a template for the SAM file, use it
        template = None
        
        if settings and "sam_template" in settings:
            template = settings["sam_template"]
        
        # Load the BAM file (re-opened per gene, with the optional template)
        bamfile = sam_utils.load_bam_reads(bam_filename, template=template)

        # Fetch reads aligning to the gene boundaries
        gene_reads = sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                                       tx_start, tx_end,
                                                       gene_obj)

        # Align the reads to the isoforms
        reads = sam_utils.sam_reads_to_isoforms(gene_reads, gene_obj, read_len,
                                                overhang_len,
                                                paired_end=paired_end)

        num_raw_reads = len(reads)

        # Skip gene if none of the reads align to gene boundaries
        if num_raw_reads < min_event_reads:
            print "Only %d reads in gene, skipping (needed >= %d reads)" \
                  %(num_raw_reads, min_event_reads)
            continue

        reads = array(reads)
        num_isoforms = len(gene_obj.isoforms)
        # All-ones hyperparameter vector, one entry per isoform (presumably
        # a symmetric Dirichlet prior -- see MISOSampler for its semantics)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = miso.get_paired_end_sampler_params(num_isoforms,
                                                                mean_frag_len,
                                                                frag_variance,
                                                                read_len,
                                                                overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params, paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params, paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type != None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)
        if not os.path.isdir(chrom_dir):
            os.makedirs(chrom_dir)
            
        # Output file is named after the gene's label within the chrom dir
        output_filename = os.path.join(chrom_dir, gene_obj.label)

        sampler.run_sampler(num_iters, reads, gene_obj,
                            hyperparameters, sampler_params,
                            output_filename, burn_in=burn_in,
                            lag=lag)
Exemple #3
0
def compute_insert_len(bam_filename, gff_filename, output_dir,
                       min_exon_size):
    """
    Compute insert length distribution and output it to the given
    directory.
    """
    print "Computing insert length distribution of %s" %(bam_filename)
    print "  - Using gene models from: %s" %(gff_filename)
    print "  - Outputting to: %s" %(output_dir)
    print "  - Minimum exon size used: %d" %(min_exon_size)

    if not os.path.isdir(output_dir):
        print "Making directory: %s" %(output_dir)
        os.makedirs(output_dir)

    output_filename = os.path.join(output_dir,
                                   "%s.insert_len" %(os.path.basename(bam_filename)))

    # Load BAM file with reads
    bamfile = sam_utils.load_bam_reads(bam_filename)
    
    # Load the genes from the GFF
    print "Loading genes from GFF..."
    t1 = time.time()
    gff_genes = gene_utils.load_genes_from_gff(gff_filename)
    t2 = time.time()
    print "  - Loading genes from GFF took %.2f seconds" %(t2 - t1)

    insert_lengths = []

    t1 = time.time()

    relevant_region = 0
    
    for gene_id, gene_info in gff_genes.iteritems():
        gene_obj = gene_info["gene_object"]

        # Get all the constitutive parts
        const_parts = gene_obj.get_const_parts()

        chrom = gene_obj.chrom

        # Consider only the large constitutive parts
        for part in const_parts:
            if part.len >= min_exon_size:
                # Get all the reads that land in the coordinates of the exon
                try:
                    exon_reads = bamfile.fetch(chrom, part.start, part.end)
                except ValueError:
                    print "Could not fetch from region: ", chrom, part.start, part.end
                    continue

                # Pair all the paired-end reads that land there
                paired_reads = sam_utils.pair_sam_reads(exon_reads)
                num_paired_reads = len(paired_reads)

                if num_paired_reads == 0:
                    continue

                print "Found %d region" %(relevant_region)
                relevant_region += 1

                # Compute the insert length of each read
                for read_pair_id, read_pair in paired_reads.iteritems():
                    if len(read_pair) != 2:
                        # Skip non-paired reads
                        continue
                    
                    left_read, right_read = read_pair
                    insert_len = right_read.pos - left_read.pos + 1

                    if insert_len > 0:
                        insert_lengths.append(insert_len)
                    else:
                        print "Negative or zero insert length ignored..."

    # Output results to file
    output_file = open(output_filename, 'w')
    insert_length_str = "\n".join(map(str, insert_lengths))
    output_file.write(insert_length_str)
    output_file.close()
                    
    t2 = time.time()
    print "Insert length computation took %.2f seconds." %(t2 - t1)
Exemple #4
0
def compute_gene_psi(gene_ids,
                     gff_index_filename,
                     bam_filename,
                     output_dir,
                     read_len,
                     overhang_len,
                     paired_end=None,
                     event_type=None):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: Run in paired-end mode. Gives mean and standard deviation
      of fragment length distribution.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." % (num_genes)
    print "  - " + ", ".join(gene_ids)
    print "  - GFF filename: %s" % (gff_index_filename)
    print "  - BAM: %s" % (bam_filename)
    print "  - Outputting to: %s" % (output_dir)

    if paired_end:
        print "  - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()

    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]

    min_event_reads = Settings.get_min_event_reads()

    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF


#    print "Loading genes from indexed GFF..."
#    t1 = time.time()
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
    #    t2 = time.time()
    #    print "  - Loading took: %.2f seconds" %(t2 - t1)

    for gene_id, gene_info in gff_genes.iteritems():
        if gene_id not in gene_ids:
            # Skip genes that we were not asked to run on
            continue

        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Find the most inclusive transcription start and end sites for each gene
        tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(
            gene_info['hierarchy'][gene_id])

        # If given a template for the SAM file, use it
        template = None

        if settings and "sam_template" in settings:
            template = settings["sam_template"]

        # Load the BAM file
        bamfile = sam_utils.load_bam_reads(bam_filename, template=template)

        # Fetch reads aligning to the gene boundaries
        gene_reads = sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                                       tx_start, tx_end,
                                                       gene_obj)

        # Align the reads to the isoforms
        reads = sam_utils.sam_reads_to_isoforms(gene_reads,
                                                gene_obj,
                                                paired_end=paired_end)

        num_raw_reads = len(reads)

        # Skip gene if none of the reads align to gene boundaries
        if num_raw_reads < min_event_reads:
            print "Only %d reads in gene, skipping (needed >= %d reads)" \
                  %(num_raw_reads, min_event_reads)
            continue

        reads = array(reads)
        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = miso.get_paired_end_sampler_params(
                num_isoforms, mean_frag_len, frag_variance, read_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)

        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(
                num_isoforms, read_len, overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type != None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)
        if not os.path.isdir(chrom_dir):
            os.makedirs(chrom_dir)

        output_filename = os.path.join(chrom_dir, gene_obj.label)

        sampler.run_sampler(num_iters,
                            reads,
                            gene_obj,
                            hyperparameters,
                            sampler_params,
                            output_filename,
                            burn_in=burn_in,
                            lag=lag)
Exemple #5
0
def compute_rpkm(gff_filename, bam_filename, read_len,
                 output_dir):
    """
    Compute RPKMs for genes listed in GFF based on BAM reads.
    """
    print "Computing RPKMs..."
    print "  - GFF filename: %s" %(gff_filename)
    print "  - BAM filename: %s" %(bam_filename)
    print "  - Output dir: %s" %(output_dir)
    
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    output_filename = os.path.join(output_dir,
                                   "%s.rpkm" %(os.path.basename(bam_filename)))
    print "Outputting RPKMs to: %s" %(output_filename)

    rpkm_fieldnames = ['gene_id', 'rpkm', 'const_exon_lens',
                       'num_reads']

    # Parse the GFF into genes
    print "Parsing GFF into genes..."
    t1 = time.time()
    gff_genes = load_genes_from_gff(gff_filename)
    t2 = time.time()
    print "Parsing took %.2f seconds" %(t2 - t1)

    # Load the BAM file
    bamfile = sam_utils.load_bam_reads(bam_filename)

    print "Counting all reads..."
    t1 = time.time()
    num_total_reads = count_total_reads(bam_filename)
    t2 = time.time()
    print "Took: %.2f seconds" %(t2 - t1)

    print "Number of total reads in BAM file: %d" %(num_total_reads)

    num_genes = 0

    rpkms_dictlist = []

    exons_too_small = {}
    num_no_const = 0

    for gene_id, gene_info in gff_genes.iteritems():
        # Get the gene object
        gene = gene_info['gene_object']

        # Get constitutive exons
        const_exons = gene.get_const_parts()

        num_reads = []
        exon_lens = []

        regions_counted = {}

        if not gene.chrom.startswith("chr"):
            chrom = "chr%s" %(gene.chrom)
        else:
            chrom = gene.chrom

        if "random" in chrom:
            print "Skipping random chromosome gene: %s, %s" \
                  %(gene_id, chrom)
            continue

        if len(const_exons) == 0:
            print "Gene %s has no constitutive regions!" %(gene_id)
            num_no_const += 1
            continue

        total_num_reads = 0

        for exon in const_exons:
            exon_len = exon.end - exon.start + 1

            counts = 0

            try:
                reads = bamfile.fetch(chrom, exon.start, exon.end)
            except ValueError:
                print "Error fetching region: %s:%d-%d" %(chrom,
                                                          exon.start,
                                                          exon.end)
                break
            
            # Count reads landing in exon
            for r in reads: counts += 1

            total_num_reads += counts

            # Skip exons that we've seen already or exons that are shorter
            # than the read length
            if (exon.start, exon.end) in regions_counted or \
               exon_len < read_len:
                continue
            
            exon_lens.append(exon_len)
            
            num_reads.append(counts)

            regions_counted[(exon.start, exon.end)] = True

        if len(regions_counted) == 0:
#            print "Gene %s exons are too small for %d-long reads" \
#                  %(gene_id, read_len)
            exons_too_small[gene_id] = total_num_reads
            continue

#        print "Used total of %d regions" %(len(regions_counted))
#        print "Total of %d regions are too small" %(num_too_small)

        rpkm = rpkm_per_region(exon_lens, num_reads, read_len,
                               num_total_reads)
        
#        print rpkm, exon_lens, num_reads, read_len      

        # Convert region lengths and number of reads to strings
        exon_lens_str = ",".join([str(e) for e in exon_lens])
        num_reads_str = ",".join([str(n) for n in num_reads])

        rpkm_entry = {'gene_id': gene_id,
                      'rpkm': "%.2f" %(rpkm),
                      'const_exon_lens': exon_lens_str,
                      'num_reads': num_reads_str}
        rpkms_dictlist.append(rpkm_entry)
        
#        print "RPKM: %.2f" %(rpkm)
            
        # Compute how many reads land in each constitutive exon
        num_genes += 1

    num_too_small = len(exons_too_small.keys())

    print "Computed RPKMs for %d genes." %(num_genes)
    print "  - Total of %d genes cannot be used because they lack const. regions." \
          %(num_no_const)
    print "  - Total of %d genes cannot be used since their exons are too small." \
          %(num_too_small)
    for gene, total_counts in exons_too_small.iteritems():
        print "      gene_id\ttotal_counts"
        print "    * %s\t%d" %(gene, total_counts)

    # Output RPKMs to file
    dictlist2file(rpkms_dictlist, output_filename,
                  rpkm_fieldnames)

    return rpkms_dictlist
Exemple #6
0
def compute_insert_len(bam_filename, gff_filename, output_dir, min_exon_size):
    """
    Compute insert length distribution and output it to the given
    directory.

    Arguments:

    - bam_filename: sorted/indexed BAM file with paired-end reads
    - gff_filename: GFF file with the gene models
    - output_dir: directory to write '<bam_basename>.insert_len' to
    - min_exon_size: only constitutive exons at least this long are used

    The output file contains one insert length per line.
    """
    print "Computing insert length distribution of %s" % (bam_filename)
    print "  - Using gene models from: %s" % (gff_filename)
    print "  - Outputting to: %s" % (output_dir)
    print "  - Minimum exon size used: %d" % (min_exon_size)

    if not os.path.isdir(output_dir):
        print "Making directory: %s" % (output_dir)
        os.makedirs(output_dir)

    output_filename = os.path.join(
        output_dir, "%s.insert_len" % (os.path.basename(bam_filename)))

    # Load BAM file with reads
    bamfile = sam_utils.load_bam_reads(bam_filename)

    # Load the genes from the GFF
    print "Loading genes from GFF..."
    t1 = time.time()
    gff_genes = gene_utils.load_genes_from_gff(gff_filename)
    t2 = time.time()
    print "  - Loading genes from GFF took %.2f seconds" % (t2 - t1)

    insert_lengths = []

    t1 = time.time()

    relevant_region = 0

    for gene_id, gene_info in gff_genes.iteritems():
        gene_obj = gene_info["gene_object"]

        # Restrict to constitutive parts: regions present in all isoforms,
        # so both mates of a pair must come from the same transcript.
        const_parts = gene_obj.get_const_parts()

        chrom = gene_obj.chrom

        # Consider only the large constitutive parts
        for part in const_parts:
            if part.len >= min_exon_size:
                # Get all the reads that land in the coordinates of the exon
                try:
                    exon_reads = bamfile.fetch(chrom, part.start, part.end)
                except ValueError:
                    # Chromosome name not in the BAM header (or bad region);
                    # skip this exon rather than aborting the whole run.
                    print "Could not fetch from region: ", chrom, part.start, part.end
                    continue

                # Pair all the paired-end reads that land there
                paired_reads = sam_utils.pair_sam_reads(exon_reads)
                num_paired_reads = len(paired_reads)

                if num_paired_reads == 0:
                    continue

                print "Found %d region" % (relevant_region)
                relevant_region += 1

                # Compute the insert length of each read
                for read_pair_id, read_pair in paired_reads.iteritems():
                    if len(read_pair) != 2:
                        # Skip non-paired reads
                        continue

                    left_read, right_read = read_pair
                    # NOTE(review): this is the distance between the two
                    # mates' start positions (+1), not an end-to-end
                    # fragment length -- confirm this is the intended
                    # definition of "insert length" here.
                    insert_len = right_read.pos - left_read.pos + 1

                    if insert_len > 0:
                        insert_lengths.append(insert_len)
                    else:
                        print "Negative or zero insert length ignored..."

    # Output results to file, one insert length per line
    output_file = open(output_filename, 'w')
    insert_length_str = "\n".join(map(str, insert_lengths))
    output_file.write(insert_length_str)
    output_file.close()

    t2 = time.time()
    print "Insert length computation took %.2f seconds." % (t2 - t1)