def count_total_reads(bam_filename):
    """
    Return total number of proper reads in BAM file.
    """
    bam_reads = sam_utils.load_bam_reads(bam_filename)
    # Exhaust the read iterator, counting as we go
    return sum(1 for _ in bam_reads)
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename, output_dir, read_len, overhang_len, paired_end=None, event_type=None): """ Run Psi at the Gene-level (for multi-isoform inference.) Arguments: - Set of gene IDs corresponding to gene IDs from the GFF - Indexed GFF filename describing the genes - BAM filename with the reads (must be sorted and indexed) - Output directory - Optional: Run in paired-end mode. Gives mean and standard deviation of fragment length distribution. """ if not os.path.isdir(output_dir): os.makedirs(output_dir) if not os.path.exists(gff_index_filename): print "Error: no such GFF file as %s" %(gff_index_filename) return num_genes = len(gene_ids) print "Computing Psi for %d genes..." %(num_genes) print " - " + ", ".join(gene_ids) print " - GFF filename: %s" %(gff_index_filename) print " - BAM: %s" %(bam_filename) print " - Outputting to: %s" %(output_dir) if paired_end: print " - Paired-end mode: ", paired_end settings = Settings.get() settings_params = Settings.get_sampler_params() burn_in = settings_params["burn_in"] lag = settings_params["lag"] num_iters = settings_params["num_iters"] min_event_reads = Settings.get_min_event_reads() if paired_end: mean_frag_len = int(paired_end[0]) frag_variance = power(int(paired_end[1]), 2) # Load the genes from the GFF # print "Loading genes from indexed GFF..." 
# t1 = time.time() gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename) # t2 = time.time() # print " - Loading took: %.2f seconds" %(t2 - t1) for gene_id, gene_info in gff_genes.iteritems(): if gene_id not in gene_ids: # Skip genes that we were not asked to run on continue gene_obj = gene_info['gene_object'] gene_hierarchy = gene_info['hierarchy'] # Find the most inclusive transcription start and end sites for each gene tx_start, tx_end = gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id]) # If given a template for the SAM file, use it template = None if settings and "sam_template" in settings: template = settings["sam_template"] # Load the BAM file bamfile = sam_utils.load_bam_reads(bam_filename, template=template) # Fetch reads aligning to the gene boundaries gene_reads = sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom, tx_start, tx_end, gene_obj) # Align the reads to the isoforms reads = sam_utils.sam_reads_to_isoforms(gene_reads, gene_obj, read_len, overhang_len, paired_end=paired_end) num_raw_reads = len(reads) # Skip gene if none of the reads align to gene boundaries if num_raw_reads < min_event_reads: print "Only %d reads in gene, skipping (needed >= %d reads)" \ %(num_raw_reads, min_event_reads) continue reads = array(reads) num_isoforms = len(gene_obj.isoforms) hyperparameters = ones(num_isoforms) ## ## Run the sampler ## # Create the sampler with the right parameters depending on whether # this is a paired-end or single-end data set. 
if paired_end: # Sampler parameters for paired-end mode sampler_params = miso.get_paired_end_sampler_params(num_isoforms, mean_frag_len, frag_variance, read_len, overhang_len=overhang_len) sampler = miso.MISOSampler(sampler_params, paired_end=True, log_dir=output_dir) else: # Sampler parameters for single-end mode sampler_params = miso.get_single_end_sampler_params(num_isoforms, read_len, overhang_len) sampler = miso.MISOSampler(sampler_params, paired_end=False, log_dir=output_dir) # Make directory for chromosome -- if given an event type, put # the gene in the event type directory if event_type != None: chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom) else: chrom_dir = os.path.join(output_dir, gene_obj.chrom) if not os.path.isdir(chrom_dir): os.makedirs(chrom_dir) output_filename = os.path.join(chrom_dir, gene_obj.label) sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters, sampler_params, output_filename, burn_in=burn_in, lag=lag)
def compute_insert_len(bam_filename, gff_filename, output_dir, min_exon_size): """ Compute insert length distribution and output it to the given directory. """ print "Computing insert length distribution of %s" %(bam_filename) print " - Using gene models from: %s" %(gff_filename) print " - Outputting to: %s" %(output_dir) print " - Minimum exon size used: %d" %(min_exon_size) if not os.path.isdir(output_dir): print "Making directory: %s" %(output_dir) os.makedirs(output_dir) output_filename = os.path.join(output_dir, "%s.insert_len" %(os.path.basename(bam_filename))) # Load BAM file with reads bamfile = sam_utils.load_bam_reads(bam_filename) # Load the genes from the GFF print "Loading genes from GFF..." t1 = time.time() gff_genes = gene_utils.load_genes_from_gff(gff_filename) t2 = time.time() print " - Loading genes from GFF took %.2f seconds" %(t2 - t1) insert_lengths = [] t1 = time.time() relevant_region = 0 for gene_id, gene_info in gff_genes.iteritems(): gene_obj = gene_info["gene_object"] # Get all the constitutive parts const_parts = gene_obj.get_const_parts() chrom = gene_obj.chrom # Consider only the large constitutive parts for part in const_parts: if part.len >= min_exon_size: # Get all the reads that land in the coordinates of the exon try: exon_reads = bamfile.fetch(chrom, part.start, part.end) except ValueError: print "Could not fetch from region: ", chrom, part.start, part.end continue # Pair all the paired-end reads that land there paired_reads = sam_utils.pair_sam_reads(exon_reads) num_paired_reads = len(paired_reads) if num_paired_reads == 0: continue print "Found %d region" %(relevant_region) relevant_region += 1 # Compute the insert length of each read for read_pair_id, read_pair in paired_reads.iteritems(): if len(read_pair) != 2: # Skip non-paired reads continue left_read, right_read = read_pair insert_len = right_read.pos - left_read.pos + 1 if insert_len > 0: insert_lengths.append(insert_len) else: print "Negative or zero insert length 
ignored..." # Output results to file output_file = open(output_filename, 'w') insert_length_str = "\n".join(map(str, insert_lengths)) output_file.write(insert_length_str) output_file.close() t2 = time.time() print "Insert length computation took %.2f seconds." %(t2 - t1)
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename, output_dir, read_len, overhang_len, paired_end=None, event_type=None): """ Run Psi at the Gene-level (for multi-isoform inference.) Arguments: - Set of gene IDs corresponding to gene IDs from the GFF - Indexed GFF filename describing the genes - BAM filename with the reads (must be sorted and indexed) - Output directory - Optional: Run in paired-end mode. Gives mean and standard deviation of fragment length distribution. """ if not os.path.isdir(output_dir): os.makedirs(output_dir) num_genes = len(gene_ids) print "Computing Psi for %d genes..." % (num_genes) print " - " + ", ".join(gene_ids) print " - GFF filename: %s" % (gff_index_filename) print " - BAM: %s" % (bam_filename) print " - Outputting to: %s" % (output_dir) if paired_end: print " - Paired-end mode: ", paired_end settings = Settings.get() settings_params = Settings.get_sampler_params() burn_in = settings_params["burn_in"] lag = settings_params["lag"] num_iters = settings_params["num_iters"] min_event_reads = Settings.get_min_event_reads() if paired_end: mean_frag_len = int(paired_end[0]) frag_variance = power(int(paired_end[1]), 2) # Load the genes from the GFF # print "Loading genes from indexed GFF..." 
# t1 = time.time() gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename) # t2 = time.time() # print " - Loading took: %.2f seconds" %(t2 - t1) for gene_id, gene_info in gff_genes.iteritems(): if gene_id not in gene_ids: # Skip genes that we were not asked to run on continue gene_obj = gene_info['gene_object'] gene_hierarchy = gene_info['hierarchy'] # Find the most inclusive transcription start and end sites for each gene tx_start, tx_end = gff_utils.get_inclusive_txn_bounds( gene_info['hierarchy'][gene_id]) # If given a template for the SAM file, use it template = None if settings and "sam_template" in settings: template = settings["sam_template"] # Load the BAM file bamfile = sam_utils.load_bam_reads(bam_filename, template=template) # Fetch reads aligning to the gene boundaries gene_reads = sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom, tx_start, tx_end, gene_obj) # Align the reads to the isoforms reads = sam_utils.sam_reads_to_isoforms(gene_reads, gene_obj, paired_end=paired_end) num_raw_reads = len(reads) # Skip gene if none of the reads align to gene boundaries if num_raw_reads < min_event_reads: print "Only %d reads in gene, skipping (needed >= %d reads)" \ %(num_raw_reads, min_event_reads) continue reads = array(reads) num_isoforms = len(gene_obj.isoforms) hyperparameters = ones(num_isoforms) ## ## Run the sampler ## # Create the sampler with the right parameters depending on whether # this is a paired-end or single-end data set. 
if paired_end: # Sampler parameters for paired-end mode sampler_params = miso.get_paired_end_sampler_params( num_isoforms, mean_frag_len, frag_variance, read_len) sampler = miso.MISOSampler(sampler_params, paired_end=True, log_dir=output_dir) else: # Sampler parameters for single-end mode sampler_params = miso.get_single_end_sampler_params( num_isoforms, read_len, overhang_len) sampler = miso.MISOSampler(sampler_params, paired_end=False, log_dir=output_dir) # Make directory for chromosome -- if given an event type, put # the gene in the event type directory if event_type != None: chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom) else: chrom_dir = os.path.join(output_dir, gene_obj.chrom) if not os.path.isdir(chrom_dir): os.makedirs(chrom_dir) output_filename = os.path.join(chrom_dir, gene_obj.label) sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters, sampler_params, output_filename, burn_in=burn_in, lag=lag)
def compute_rpkm(gff_filename, bam_filename, read_len, output_dir): """ Compute RPKMs for genes listed in GFF based on BAM reads. """ print "Computing RPKMs..." print " - GFF filename: %s" %(gff_filename) print " - BAM filename: %s" %(bam_filename) print " - Output dir: %s" %(output_dir) if not os.path.isdir(output_dir): os.makedirs(output_dir) output_filename = os.path.join(output_dir, "%s.rpkm" %(os.path.basename(bam_filename))) print "Outputting RPKMs to: %s" %(output_filename) rpkm_fieldnames = ['gene_id', 'rpkm', 'const_exon_lens', 'num_reads'] # Parse the GFF into genes print "Parsing GFF into genes..." t1 = time.time() gff_genes = load_genes_from_gff(gff_filename) t2 = time.time() print "Parsing took %.2f seconds" %(t2 - t1) # Load the BAM file bamfile = sam_utils.load_bam_reads(bam_filename) print "Counting all reads..." t1 = time.time() num_total_reads = count_total_reads(bam_filename) t2 = time.time() print "Took: %.2f seconds" %(t2 - t1) print "Number of total reads in BAM file: %d" %(num_total_reads) num_genes = 0 rpkms_dictlist = [] exons_too_small = {} num_no_const = 0 for gene_id, gene_info in gff_genes.iteritems(): # Get the gene object gene = gene_info['gene_object'] # Get constitutive exons const_exons = gene.get_const_parts() num_reads = [] exon_lens = [] regions_counted = {} if not gene.chrom.startswith("chr"): chrom = "chr%s" %(gene.chrom) else: chrom = gene.chrom if "random" in chrom: print "Skipping random chromosome gene: %s, %s" \ %(gene_id, chrom) continue if len(const_exons) == 0: print "Gene %s has no constitutive regions!" 
%(gene_id) num_no_const += 1 continue total_num_reads = 0 for exon in const_exons: exon_len = exon.end - exon.start + 1 counts = 0 try: reads = bamfile.fetch(chrom, exon.start, exon.end) except ValueError: print "Error fetching region: %s:%d-%d" %(chrom, exon.start, exon.end) break # Count reads landing in exon for r in reads: counts += 1 total_num_reads += counts # Skip exons that we've seen already or exons that are shorter # than the read length if (exon.start, exon.end) in regions_counted or \ exon_len < read_len: continue exon_lens.append(exon_len) num_reads.append(counts) regions_counted[(exon.start, exon.end)] = True if len(regions_counted) == 0: # print "Gene %s exons are too small for %d-long reads" \ # %(gene_id, read_len) exons_too_small[gene_id] = total_num_reads continue # print "Used total of %d regions" %(len(regions_counted)) # print "Total of %d regions are too small" %(num_too_small) rpkm = rpkm_per_region(exon_lens, num_reads, read_len, num_total_reads) # print rpkm, exon_lens, num_reads, read_len # Convert region lengths and number of reads to strings exon_lens_str = ",".join([str(e) for e in exon_lens]) num_reads_str = ",".join([str(n) for n in num_reads]) rpkm_entry = {'gene_id': gene_id, 'rpkm': "%.2f" %(rpkm), 'const_exon_lens': exon_lens_str, 'num_reads': num_reads_str} rpkms_dictlist.append(rpkm_entry) # print "RPKM: %.2f" %(rpkm) # Compute how many reads land in each constitutive exon num_genes += 1 num_too_small = len(exons_too_small.keys()) print "Computed RPKMs for %d genes." %(num_genes) print " - Total of %d genes cannot be used because they lack const. regions." \ %(num_no_const) print " - Total of %d genes cannot be used since their exons are too small." \ %(num_too_small) for gene, total_counts in exons_too_small.iteritems(): print " gene_id\ttotal_counts" print " * %s\t%d" %(gene, total_counts) # Output RPKMs to file dictlist2file(rpkms_dictlist, output_filename, rpkm_fieldnames) return rpkms_dictlist
def compute_insert_len(bam_filename, gff_filename, output_dir, min_exon_size): """ Compute insert length distribution and output it to the given directory. """ print "Computing insert length distribution of %s" % (bam_filename) print " - Using gene models from: %s" % (gff_filename) print " - Outputting to: %s" % (output_dir) print " - Minimum exon size used: %d" % (min_exon_size) if not os.path.isdir(output_dir): print "Making directory: %s" % (output_dir) os.makedirs(output_dir) output_filename = os.path.join( output_dir, "%s.insert_len" % (os.path.basename(bam_filename))) # Load BAM file with reads bamfile = sam_utils.load_bam_reads(bam_filename) # Load the genes from the GFF print "Loading genes from GFF..." t1 = time.time() gff_genes = gene_utils.load_genes_from_gff(gff_filename) t2 = time.time() print " - Loading genes from GFF took %.2f seconds" % (t2 - t1) insert_lengths = [] t1 = time.time() relevant_region = 0 for gene_id, gene_info in gff_genes.iteritems(): gene_obj = gene_info["gene_object"] # Get all the constitutive parts const_parts = gene_obj.get_const_parts() chrom = gene_obj.chrom # Consider only the large constitutive parts for part in const_parts: if part.len >= min_exon_size: # Get all the reads that land in the coordinates of the exon try: exon_reads = bamfile.fetch(chrom, part.start, part.end) except ValueError: print "Could not fetch from region: ", chrom, part.start, part.end continue # Pair all the paired-end reads that land there paired_reads = sam_utils.pair_sam_reads(exon_reads) num_paired_reads = len(paired_reads) if num_paired_reads == 0: continue print "Found %d region" % (relevant_region) relevant_region += 1 # Compute the insert length of each read for read_pair_id, read_pair in paired_reads.iteritems(): if len(read_pair) != 2: # Skip non-paired reads continue left_read, right_read = read_pair insert_len = right_read.pos - left_read.pos + 1 if insert_len > 0: insert_lengths.append(insert_len) else: print "Negative or zero insert 
length ignored..." # Output results to file output_file = open(output_filename, 'w') insert_length_str = "\n".join(map(str, insert_lengths)) output_file.write(insert_length_str) output_file.close() t2 = time.time() print "Insert length computation took %.2f seconds." % (t2 - t1)