def sanity_check_lifted_nagalakshmi_file(fn): gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf' CDSs = gtf.get_CDSs(gtf_fn) gtf_dict = {t.name: t for t in CDSs} genes = read_nagalakshmi_file(fn) discrepancies_after = defaultdict(list) for name in genes: if name not in gtf_dict: print name, 'not in gtf_dict' continue start, end = genes[name]['SGD_Start'], genes[name]['SGD_End'] chrom = genes[name]['Chrom'] if start > end: start, end = end, start end -= 1 if start != gtf_dict[name].start or end != gtf_dict[name].end: #print name, chrom #print start, end #print gtf_dict[name].start, gtf_dict[name].end #raw_input() discrepancies_after[chrom].append(name) return discrepancies_after
def sanity_check_nagalakshmi_file(): gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/SGD1.01/transcriptome/genes.gtf' CDSs = gtf.get_CDSs(gtf_fn) gtf_dict = {t.name: t for t in CDSs} genes = read_nagalakshmi_file('nagalakshmi_annotations.txt') discrepancies_after = {} for name in genes: if name not in gtf_dict: print name, 'not in gtf_dict' continue start, end = genes[name]['SGD_Start'] - 1, genes[name]['SGD_End'] - 1 chrom = genes[name]['Chrom'] if start > end: start, end = end, start if start != gtf_dict[name].start or end != gtf_dict[name].end: #print name, chrom #print start, end #print gtf_dict[name].start, gtf_dict[name].end #print if chrom not in discrepancies_after: discrepancies_after[chrom] = start else: discrepancies_after[chrom] = min(start, discrepancies_after[chrom]) return discrepancies_after
def sanity_check_weinberg_file(): gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf' CDSs = gtf.get_CDSs(gtf_fn) gtf_dict = {t.name: t for t in CDSs} genes = read_weinberg_file() discrepancies_after = defaultdict(list) for name in genes: if name not in gtf_dict: print name, 'not in gtf_dict' continue start, end = genes[name]['CdsStart'], genes[name]['CdsEnd'] chrom = genes[name]['Chromosome'] end -= 1 if start != gtf_dict[name].start or end != gtf_dict[name].end: print name, chrom print start, end print gtf_dict[name].start, gtf_dict[name].end raw_input() discrepancies_after[chrom].append(name) return discrepancies_after
def plot_frameshifts(
    gtf_fn,
    bam_fns,
    gene_name,
    exp_name,
    genome_dir,
    show_fractions=False,
):
    """Plot per-frame read counts along one gene, marking start/stop codons.

    Position counts for 28-long fragments are collected from every file in
    `bam_fns` and summed.  For each codon in the buffered extent of the gene
    and each of the three frames, the count at the A-site-shifted position is
    tallied and the three bases beginning at that frame are checked against
    the start codon (ATG) and the stop codons (TAA, TAG, TGA).

    One panel per frame shows the non-zero counts as dots, with green bands
    over start codons and red bands over stop codons.  When `show_fractions`
    is true, an extra top panel shows, per frame, the fraction of reads seen
    so far (solid) and still remaining (dashed) at every codon.

    NOTE(review): this module defines plot_frameshifts a second time further
    down; the later definition is the one bound at import time.

    Args:
        gtf_fn: annotation file handed to gtf.get_CDSs.
        bam_fns: iterable of file names handed one at a time to
            positions.get_Transcript_extent_position_counts.
        gene_name: name of the CDS to plot; must be a key of the CDS table.
        exp_name: label used as the first line of the figure title.
        genome_dir: genome directory handed to gtf.get_CDSs.
        show_fractions: whether to add the cumulative-fraction panel.

    Returns:
        The matplotlib figure that was created.
    """
    codon_buffer = 10
    start_codon = 'ATG'
    stop_codons = {'TAA', 'TAG', 'TGA'}
    A_site_offset = 5
    lengths = [28]

    transcripts_by_name = {
        cds.name: cds for cds in gtf.get_CDSs(gtf_fn, genome_dir, '/dev/null')
    }
    transcript = transcripts_by_name[gene_name]

    left_buffer = 30 + codon_buffer * 3
    right_buffer = (codon_buffer + 1) * 3
    transcript.build_extent_maps(left_buffer, right_buffer)

    experiment_counts = [
        positions.get_Transcript_extent_position_counts(
            transcript,
            bam_fn,
            lengths,
            left_buffer=left_buffer,
            right_buffer=right_buffer,
        )
        for bam_fn in bam_fns
    ]

    # Sequence of the buffered extent, used to locate start and stop codons.
    extent_sequence = transcript.get_extent_sequence(
        left_buffer=left_buffer,
        right_buffer=right_buffer,
    )

    for length in lengths:
        A_site_offset = positions.A_site_offsets['yeast'][length]
        per_experiment = [counts[length] for counts in experiment_counts]
        length_counts = reduce(operator.add, per_experiment)

        codon_numbers = np.arange(
            -codon_buffer,
            transcript.extent_length / 3 + codon_buffer,
        )

        # frame_counts_list[i, j] is the number of RPFs starting at frame i
        # of codon j.
        frame_counts_list = np.zeros((3, len(codon_numbers)), int)
        start_codon_locations = [[], [], []]
        stop_codon_locations = [[], [], []]

        for column, codon_number in enumerate(codon_numbers):
            codon_start = 3 * codon_number
            for frame in range(3):
                position = codon_start + frame
                frame_counts_list[frame, column] = length_counts[
                    'start', position - A_site_offset
                ]

                codon = ''.join(extent_sequence['start', position:position + 3])
                if codon == start_codon:
                    start_codon_locations[frame].append(codon_number)
                if codon in stop_codons:
                    stop_codon_locations[frame].append(codon_number)

        # Three per-frame panels, plus a cumulative panel on top if requested.
        fig, axs = plt.subplots(4 if show_fractions else 3, 1, sharex=True)
        if show_fractions:
            cumulative_ax, frame_axs = axs[0], axs[1:]
        else:
            frame_axs = axs

        y_max = frame_counts_list.max() + 1
        for frame, (ax, frame_counts) in enumerate(zip(frame_axs, frame_counts_list)):
            nonzero = [
                (c_n, f_c)
                for c_n, f_c in zip(codon_numbers, frame_counts)
                if f_c != 0
            ]
            ax.plot(
                [c_n for c_n, _ in nonzero],
                [f_c for _, f_c in nonzero],
                '.',
            )
            ax.set_ylim(0, y_max)
            ax.set_xlim(codon_numbers[0], codon_numbers[-1])
            ax.set_title('Frame {0}'.format(frame))
            ax.set_ylabel('Read counts')

            for x in start_codon_locations[frame]:
                ax.axvspan(x - 0.5, x + 0.5,
                           facecolor='green', edgecolor='none', alpha=0.2)
            for x in stop_codon_locations[frame]:
                ax.axvspan(x - 0.5, x + 0.5,
                           facecolor='red', edgecolor='none', alpha=0.2)

        frame_axs[-1].set_xlabel('Codons from start codon')

        if show_fractions:
            # Running totals from the left edge ...
            frames_so_far = frame_counts_list.cumsum(axis=1)
            fraction_frames_so_far = np.true_divide(
                frames_so_far,
                np.maximum(1, frames_so_far.sum(axis=0)),
            )
            # ... and from the right edge (reverse, accumulate, reverse back).
            frames_remaining = frame_counts_list[:, ::-1].cumsum(axis=1)[:, ::-1]
            fraction_frames_remaining = np.true_divide(
                frames_remaining,
                np.maximum(1, frames_remaining.sum(axis=0)),
            )

            for frame in range(3):
                color = colors[frame]
                cumulative_ax.plot(
                    codon_numbers,
                    fraction_frames_so_far[frame],
                    color=color,
                    label='{0} so far'.format(frame),
                )
                cumulative_ax.plot(
                    codon_numbers,
                    fraction_frames_remaining[frame],
                    color=color,
                    linestyle='--',
                    label='{0} remaining'.format(frame),
                )

            cumulative_ax.set_xlim(codon_numbers[0])
            cumulative_ax.set_ylim(-0.02, 1.02)
            cumulative_ax.set_ylabel('Fraction of reads in extent')
            cumulative_ax.legend(loc='upper right', framealpha=0.5)

        title = '{2}\n{0}\nlength {1} fragments'.format(gene_name, length, exp_name)
        fig.suptitle(title)

    return fig
def plot_frameshifts(gtf_fn, bam_fns, gene_name, exp_name, genome_dir, show_fractions=False):
    """Plot per-frame read counts along one gene, marking start/stop codons.

    Position counts for 28-long fragments are collected from every file in
    `bam_fns` and summed.  For each codon in the buffered extent of the gene
    and each of the three frames, the count at the A-site-shifted position is
    tallied and the three bases beginning at that frame are checked against
    the start codon (ATG) and the stop codons (TAA, TAG, TGA).

    One panel per frame shows the non-zero counts as dots, with green bands
    over start codons and red bands over stop codons.  When `show_fractions`
    is true, an extra top panel shows, per frame, the fraction of reads seen
    so far (solid) and still remaining (dashed) at every codon.

    Args:
        gtf_fn: annotation file handed to gtf.get_CDSs.
        bam_fns: iterable of file names handed one at a time to
            positions.get_Transcript_extent_position_counts.  Must not be
            empty: the per-file counts are combined with reduce().
        gene_name: name of the CDS to plot.
        exp_name: label used as the first line of the figure title.
        genome_dir: genome directory handed to gtf.get_CDSs.
        show_fractions: whether to add the cumulative-fraction panel.

    Returns:
        The matplotlib figure that was created.

    Raises:
        KeyError: if `gene_name` is not the name of any CDS in the annotation.
        TypeError: if `bam_fns` is empty (raised by reduce()).
    """
    # Same callable as the Python 2 builtin; importing it here keeps this
    # function independent of whether `reduce` is a builtin.
    from functools import reduce

    codon_buffer = 10
    start_codon = "ATG"
    stop_codons = {"TAA", "TAG", "TGA"}
    lengths = [28]

    CDSs = {c.name: c for c in gtf.get_CDSs(gtf_fn, genome_dir, "/dev/null")}
    transcript = CDSs[gene_name]

    left_buffer = 30 + codon_buffer * 3
    right_buffer = (codon_buffer + 1) * 3
    transcript.build_extent_maps(left_buffer, right_buffer)

    experiment_counts = [
        positions.get_Transcript_extent_position_counts(
            transcript, bam_fn, lengths, left_buffer=left_buffer, right_buffer=right_buffer
        )
        for bam_fn in bam_fns
    ]

    # Sequence of the buffered extent, used to locate start and stop codons.
    extent_sequence = transcript.get_extent_sequence(left_buffer=left_buffer, right_buffer=right_buffer)

    for length in lengths:
        A_site_offset = positions.A_site_offsets["yeast"][length]
        length_counts = reduce(operator.add, [counts[length] for counts in experiment_counts])

        # Floor division keeps the codon axis integral regardless of the
        # interpreter's `/` semantics.
        codon_numbers = np.arange(-codon_buffer, transcript.extent_length // 3 + codon_buffer)

        # frame_counts_list[i, j] is the number of RPFs starting at frame i
        # of codon j.
        frame_counts_list = np.zeros((3, len(codon_numbers)), int)
        start_codon_locations = [[] for _ in range(3)]
        stop_codon_locations = [[] for _ in range(3)]

        for c, codon_number in enumerate(codon_numbers):
            codon_start = 3 * codon_number
            for frame in range(3):
                position = codon_start + frame
                frame_counts_list[frame, c] = length_counts["start", position - A_site_offset]

                codon = "".join(extent_sequence["start", position : position + 3])
                if codon == start_codon:
                    start_codon_locations[frame].append(codon_number)
                if codon in stop_codons:
                    stop_codon_locations[frame].append(codon_number)

        if show_fractions:
            fig, axs = plt.subplots(4, 1, sharex=True)
            cumulative_ax = axs[0]
            frame_axs = axs[1:]
        else:
            fig, frame_axs = plt.subplots(3, 1, sharex=True)

        # Shared y-limit so the three frame panels are directly comparable.
        y_max = frame_counts_list.max() + 1

        for frame, (ax, frame_counts) in enumerate(zip(frame_axs, frame_counts_list)):
            # Only codons with at least one read are drawn; build the filter
            # once and apply it to both axes.
            nonzero = frame_counts != 0
            ax.plot(codon_numbers[nonzero], frame_counts[nonzero], ".")

            ax.set_ylim(0, y_max)
            ax.set_xlim(codon_numbers[0], codon_numbers[-1])
            ax.set_title("Frame {0}".format(frame))
            ax.set_ylabel("Read counts")

            for x in start_codon_locations[frame]:
                ax.axvspan(x - 0.5, x + 0.5, facecolor="green", edgecolor="none", alpha=0.2)
            for x in stop_codon_locations[frame]:
                ax.axvspan(x - 0.5, x + 0.5, facecolor="red", edgecolor="none", alpha=0.2)

        frame_axs[-1].set_xlabel("Codons from start codon")

        if show_fractions:
            # Running per-frame totals from the left edge, normalised by the
            # all-frame total at each codon (clamped to 1 to avoid 0/0).
            frames_so_far = frame_counts_list.cumsum(axis=1)
            fraction_frames_so_far = np.true_divide(frames_so_far, np.maximum(1, frames_so_far.sum(axis=0)))

            # Same from the right edge: flip, accumulate, flip back.
            frames_remaining = np.fliplr(np.fliplr(frame_counts_list).cumsum(axis=1))
            fraction_frames_remaining = np.true_divide(
                frames_remaining, np.maximum(1, frames_remaining.sum(axis=0))
            )

            for frame in range(3):
                color = colors[frame]
                cumulative_ax.plot(
                    codon_numbers,
                    fraction_frames_so_far[frame],
                    color=color,
                    label="{0} so far".format(frame),
                )
                cumulative_ax.plot(
                    codon_numbers,
                    fraction_frames_remaining[frame],
                    color=color,
                    linestyle="--",
                    label="{0} remaining".format(frame),
                )

            cumulative_ax.set_xlim(codon_numbers[0])
            cumulative_ax.set_ylim(-0.02, 1.02)
            cumulative_ax.set_ylabel("Fraction of reads in extent")
            cumulative_ax.legend(loc="upper right", framealpha=0.5)

        fig.suptitle("{2}\n{0}\nlength {1} fragments".format(gene_name, length, exp_name))

    return fig