def read_file(file_name, specific_keys=None, show_progress=False):
    """Load every top-level entry of an HDF5 file into a dictionary.

    Args:
        file_name: path of the HDF5 file; it is opened read-only.
        specific_keys: passed through unchanged to ``build_gene`` for each
            entry.
        show_progress: when true, the entry names are iterated through
            ``utilities.progress_bar``.

    Returns:
        dict mapping each top-level entry name to the value returned by
        ``build_gene`` for that entry.
    """
    with h5py.File(file_name, 'r') as hdf5_file:
        names = hdf5_file.keys()
        if show_progress:
            names = utilities.progress_bar(len(names), names)
        return {
            name: build_gene(hdf5_file[name], specific_keys)
            for name in names
        }
def call_3p_peaks():
    """Write a table of the strongest 3'-end peak offset per transcript.

    For a fixed selection of experiments, each CDS that is present in the
    experiment's 'three_prime_read_positions' HDF5 file is scanned over
    offsets 0..399 relative to the 'stop_codon' landmark, and the offset
    with the largest count in gene['all'] is recorded.

    One line per transcript recorded for 'Cerevisiae_3Pseq' is written to the
    hard-coded output file, ordered by that experiment's peak offset: the
    string form of the CDS followed by the peak offset from every selected
    experiment, each field terminated by a tab.

    Returns:
        None.
    """
    gtf_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    output_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_3p_lengths.txt'

    # The fetcher is not used below; the call is kept in case constructing it
    # has effects that callers rely on (TODO confirm and remove).
    region_fetcher = genomes.build_region_fetcher(genome_dir)

    CDSs = gtf.get_CDSs(gtf_fn)
    CDS_dict = {t.name: t for t in CDSs}

    experiments = build_all_experiments(verbose=False)
    three_prime_experiments = (
        sorted(experiments['three_p_seq']['three_p_seq'].items())
        + [(n, e)
           for n, e in sorted(experiments['three_t_fill_seq']['wilkening_nar'].items())
           if '3tfill_ypd_rep1' in n]
        + [(n, e)
           for n, e in sorted(experiments['TIF_seq']['pelechano_nature'].items())
           if n in ('ypd_bio1_lib1', 'ypd_bio1_lib4')]
    )

    # Offsets examined relative to the stop codon; identical for every gene.
    xs = np.arange(0, 400)

    argmaxes = {}
    # fractions / joints are accumulated but never read in this function.
    fractions = {}
    joints = {}
    for name, experiment in three_prime_experiments:
        print(name)
        argmaxes[name] = {}
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names['three_prime_read_positions']
        # Context manager so the HDF5 handle is released after each
        # experiment instead of staying open for the life of the process.
        with h5py.File(fn, 'r') as f:
            for transcript in utilities.progress_bar(len(CDSs), CDSs):
                if transcript.name not in f:
                    continue
                gene = Serialize.read_positions.build_gene(f[transcript.name])
                counts = gene['all']
                argmax = counts.argmax_over_slice('stop_codon', xs)
                argmaxes[name][transcript.name] = argmax
                most = counts['stop_codon', argmax]
                total = counts['stop_codon', xs].sum()
                if total > 9:
                    fraction = np.true_divide(most, total)
                    fractions[name].append(fraction)
                    joints[name].append((argmax, fraction))

    with open(output_fn, 'w') as output_fh:
        reference = argmaxes['Cerevisiae_3Pseq']
        name_order = sorted(reference, key=reference.get)
        for name in name_order:
            output_fh.write('{0}\t'.format(str(CDS_dict[name])))
            for exp_name, _ in three_prime_experiments:
                # NOTE(review): raises KeyError if a transcript recorded for
                # 'Cerevisiae_3Pseq' was skipped for another experiment.
                output_fh.write('{0}\t'.format(argmaxes[exp_name][name]))
            output_fh.write('\n')
def produce_transcript_base_compositions():
    """Count 'A' bases in sliding windows along every CDS and save them.

    For each CDS returned by ``gff.get_CDSs`` and each window size in
    (5, 10, 20), a ``positions.PositionCounts`` is filled so that the value
    stored at ('start', left_edge) is the number of positions equal to 'A'
    in [left_edge, left_edge + window). The per-transcript, per-window
    results are written with ``Serialize.read_positions.write_file`` to the
    hard-coded composition file.

    Returns:
        None.
    """
    gff_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5'

    left_buffer = 500
    right_buffer = 500
    windows = [5, 10, 20]

    CDSs = gff.get_CDSs(gff_fn, genome_dir)
    genes = {}
    for transcript in utilities.progress_bar(len(CDSs), CDSs):
        per_window = genes[transcript.name] = {}
        transcript.build_coordinate_maps()

        landmarks = {
            'start': 0,
            'start_codon': transcript.transcript_start_codon,
            'stop_codon': transcript.transcript_stop_codon,
            'end': transcript.transcript_length,
        }
        sequence = transcript.get_transcript_sequence(left_buffer, right_buffer)
        is_A = (sequence.data == 'A')
        A_locations = positions.PositionCounts(
            landmarks, left_buffer, right_buffer, data=is_A)

        for window in windows:
            recent_As = positions.PositionCounts(
                landmarks, left_buffer, right_buffer)
            stop = transcript.CDS_length + right_buffer - window
            for left_edge in range(-left_buffer, stop):
                right_edge = left_edge + window
                recent_As['start', left_edge] = sum(
                    A_locations['start', left_edge:right_edge])
            per_window[window] = recent_As

        transcript.delete_coordinate_maps()

    Serialize.read_positions.write_file(genes, composition_fn)
def counts_from_read_positions_fn(read_positions_fn, key='all'):
    """Yield ``(gene_name, counts)`` for every entry of an HDF5 file.

    Args:
        read_positions_fn: path of the HDF5 file; it is opened read-only.
        key: which entry of each gene to yield. The special value
            'nonzero' yields ``gene['all'] - gene[0]``; any other value
            yields ``gene[key]``, loading only ``str(key)`` from the file.

    Yields:
        Tuples of (gene name, counts object for that gene).
    """
    # The file is held open only while the generator is running and is
    # closed once it is exhausted or closed, rather than never being closed.
    with h5py.File(read_positions_fn, 'r') as hdf5_file:
        progress = utilities.progress_bar(len(hdf5_file), hdf5_file)
        for gene_name in progress:
            if key == 'nonzero':
                gene = Serialize.read_positions.build_gene(hdf5_file[gene_name], specific_keys={'all', '0'})
                nonzero_counts = gene['all'] - gene[0]
                yield gene_name, nonzero_counts
            else:
                gene = Serialize.read_positions.build_gene(hdf5_file[gene_name], specific_keys={str(key)})
                counts = gene[key]
                yield gene_name, counts
def counts_from_read_positions_fn(read_positions_fn, key='all'):
    """Iterate over the genes of an HDF5 file, yielding their counts.

    Args:
        read_positions_fn: path of the HDF5 file, opened read-only.
        key: entry to extract for each gene. With 'nonzero', the entries
            'all' and '0' are loaded and ``gene['all'] - gene[0]`` is
            yielded; otherwise only ``str(key)`` is loaded and ``gene[key]``
            is yielded.

    Yields:
        (gene_name, counts) pairs, one per entry in the file.
    """
    build_gene = Serialize.read_positions.build_gene
    # ``with`` guarantees the handle is released when iteration finishes or
    # the generator is closed; previously it was left open.
    with h5py.File(read_positions_fn, 'r') as hdf5_file:
        progress = utilities.progress_bar(len(hdf5_file), hdf5_file)
        for gene_name in progress:
            if key == 'nonzero':
                gene = build_gene(
                    hdf5_file[gene_name], specific_keys={'all', '0'})
                yield gene_name, gene['all'] - gene[0]
            else:
                gene = build_gene(
                    hdf5_file[gene_name], specific_keys={str(key)})
                yield gene_name, gene[key]
def call_5p_peaks():
    """Tally the strongest 5'-end peak offset per experiment.

    For a fixed selection of experiments, each CDS present in the
    experiment's "five_prime_read_positions" HDF5 file is scanned over
    offsets -300..-1 relative to the "start_codon" landmark. The offset with
    the largest count in gene["all"] is tallied in a Counter; when the
    window holds more than 9 counts, the share of counts at that offset is
    also collected. Transcripts whose window total is 0 are printed.

    Returns:
        None. The tallies are kept only in local variables.
        NOTE(review): presumably intended for interactive inspection -
        confirm whether the results should be returned.
    """
    gtf_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf"
    genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"

    # The fetcher is not used below; the call is kept in case constructing it
    # has effects that callers rely on (TODO confirm and remove).
    region_fetcher = genomes.build_region_fetcher(genome_dir)

    CDSs = gtf.get_CDSs(gtf_fn)

    experiments = build_all_experiments(verbose=False)
    five_prime_experiments = (
        [(n, e) for n, e in sorted(experiments["TL_seq"]["arribere_gr"].items()) if "TLSeq1" in n]
        + [(n, e) for n, e in sorted(experiments["TL_seq"]["park_nar"].items()) if n == "SMORE-seq_WT_TAP+_rep1"]
        + [
            (n, e)
            for n, e in sorted(experiments["TIF_seq"]["pelechano_nature"].items())
            if n in ("ypd_bio1_lib1", "ypd_bio1_lib4")
        ]
    )

    # Offsets examined relative to the start codon; identical for every gene.
    xs = np.arange(-300, 0)

    argmaxes = {}
    fractions = {}
    joints = {}
    for name, experiment in five_prime_experiments:
        print(name)
        argmaxes[name] = Counter()
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names["five_prime_read_positions"]
        # Context manager so each experiment's HDF5 handle is released as
        # soon as it has been scanned instead of being leaked.
        with h5py.File(fn, "r") as f:
            for transcript in utilities.progress_bar(len(CDSs), CDSs):
                if transcript.name not in f:
                    continue
                gene = Serialize.read_positions.build_gene(f[transcript.name])
                counts = gene["all"]
                argmax = counts.argmax_over_slice("start_codon", xs)
                argmaxes[name][argmax] += 1
                most = counts["start_codon", argmax]
                total = counts["start_codon", xs].sum()
                if total == 0:
                    print(transcript)
                if total > 9:
                    fraction = np.true_divide(most, total)
                    fractions[name].append(fraction)
                    joints[name].append((argmax, fraction))
def look_at_densities():
    """Sum the '0' counts on either side of the 'polyA' landmark per gene.

    Loads the experiment described by a hard-coded description file, opens
    its 'three_prime_read_positions' HDF5 file and, for every gene in it,
    sums ``gene[0]`` over the slices ('polyA', -100:1) and ('polyA', 1:102).

    Returns:
        (names, zero_ratios): the gene names, and a parallel list of
        (before, after) sums. Despite the name, the second list holds the
        two sums, not their quotient.
    """
    # Imported here, as in the original, rather than at module level.
    import ribosome_profiling_experiment

    description_fn = '/home/jah/projects/ribosomes/experiments/weinberg/RiboZero/job/description.txt'
    exp = ribosome_profiling_experiment.RibosomeProfilingExperiment.from_description_file_name(description_fn)

    names = []
    zero_ratios = []
    # Context manager so the HDF5 handle is closed before returning; it was
    # previously left open.
    with h5py.File(exp.file_names['three_prime_read_positions'], 'r') as hdf5_file:
        progress = utilities.progress_bar(len(hdf5_file), hdf5_file)
        for gene_name in progress:
            gene = Serialize.read_positions.build_gene(hdf5_file[gene_name], specific_keys={'0'})
            zero_counts = gene[0]
            before = zero_counts['polyA', -100:1].sum()
            after = zero_counts['polyA', 1:102].sum()
            names.append(gene_name)
            zero_ratios.append((before, after))

    return names, zero_ratios
def call_5p_peaks():
    """Tally, per experiment, the offset of the largest 5'-end read peak.

    Each CDS found in an experiment's 'five_prime_read_positions' HDF5 file
    is examined over offsets -300..-1 relative to 'start_codon'. The offset
    holding the maximum of gene['all'] is counted in a per-experiment
    Counter. If the window contains more than 9 counts, the fraction of them
    at the peak is appended to ``fractions`` and (offset, fraction) to
    ``joints``. Transcripts with an empty window are printed.

    Returns:
        None. The tallies are kept only in local variables.
        NOTE(review): presumably intended for interactive inspection -
        confirm whether the results should be returned.
    """
    gtf_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'

    # Unused below; the call is retained in case building the fetcher has
    # effects that matter elsewhere (TODO confirm and remove).
    region_fetcher = genomes.build_region_fetcher(genome_dir)

    CDSs = gtf.get_CDSs(gtf_fn)

    experiments = build_all_experiments(verbose=False)
    five_prime_experiments = (
        [(n, e) for n, e in sorted(experiments['TL_seq']['arribere_gr'].items())
         if 'TLSeq1' in n] +
        [(n, e) for n, e in sorted(experiments['TL_seq']['park_nar'].items())
         if n == 'SMORE-seq_WT_TAP+_rep1'] +
        [(n, e) for n, e in sorted(experiments['TIF_seq']['pelechano_nature'].items())
         if n in ('ypd_bio1_lib1', 'ypd_bio1_lib4')]
    )

    # The scanned window is the same for every transcript, so build it once.
    xs = np.arange(-300, 0)

    argmaxes = {}
    fractions = {}
    joints = {}
    for name, experiment in five_prime_experiments:
        print(name)
        argmaxes[name] = Counter()
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names['five_prime_read_positions']
        # ``with`` closes the HDF5 file once this experiment has been
        # processed; it used to stay open indefinitely.
        with h5py.File(fn, 'r') as f:
            for transcript in utilities.progress_bar(len(CDSs), CDSs):
                if transcript.name not in f:
                    continue
                gene = Serialize.read_positions.build_gene(f[transcript.name])
                counts = gene['all']
                argmax = counts.argmax_over_slice('start_codon', xs)
                argmaxes[name][argmax] += 1
                most = counts['start_codon', argmax]
                total = counts['start_codon', xs].sum()
                if total == 0:
                    print(transcript)
                if total > 9:
                    fraction = np.true_divide(most, total)
                    fractions[name].append(fraction)
                    joints[name].append((argmax, fraction))
def produce_transcript_base_compositions():
    """Tabulate windowed 'A' counts for each CDS and write them to HDF5.

    Every CDS from ``gff.get_CDSs`` gets one ``positions.PositionCounts``
    per window size (5, 10 and 20). The entry at ("start", left_edge) holds
    how many positions in [left_edge, left_edge + window) equal "A" in the
    transcript sequence fetched with 500-position buffers on both sides.
    The nested {transcript name: {window: counts}} mapping is saved through
    ``Serialize.read_positions.write_file``.

    Returns:
        None.
    """
    gff_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff"
    genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"
    composition_fn = (
        "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5"
    )

    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    left_buffer = 500
    right_buffer = 500
    windows = [5, 10, 20]

    genes = {}
    for transcript in utilities.progress_bar(len(CDSs), CDSs):
        genes[transcript.name] = {}
        transcript.build_coordinate_maps()

        landmarks = {
            "start": 0,
            "start_codon": transcript.transcript_start_codon,
            "stop_codon": transcript.transcript_stop_codon,
            "end": transcript.transcript_length,
        }
        sequence = transcript.get_transcript_sequence(left_buffer, right_buffer)
        A_locations = positions.PositionCounts(
            landmarks,
            left_buffer,
            right_buffer,
            data=(sequence.data == "A"),
        )

        for window in windows:
            windowed = positions.PositionCounts(landmarks, left_buffer, right_buffer)
            left_edge = -left_buffer
            last = transcript.CDS_length + right_buffer - window
            while left_edge < last:
                in_window = A_locations["start", left_edge : left_edge + window]
                windowed["start", left_edge] = sum(in_window)
                left_edge += 1
            genes[transcript.name][window] = windowed

        transcript.delete_coordinate_maps()

    Serialize.read_positions.write_file(genes, composition_fn)
updated_cigar = soft_clipped_block + trimmed_cigar else: # Remove blocks from the end. trimmed_cigar = sam.truncate_cigar_blocks_up_to( mapping.cigar, trimmed_length) updated_cigar = trimmed_cigar + soft_clipped_block mapping.cigar = updated_cigar if mapping.tags: # Clear the MD tag since the possible removal of bases to the # alignment may have made it inaccurate. # TODO: now have machinery to make it accurate. filtered_tags = filter(lambda t: t[0] != 'MD', mapping.tags) mapping.tags = filtered_tags set_nongenomic_length(mapping, bases_to_trim) return mapping if __name__ == '__main__': fastq_fn = '/home/jah/projects/ribosomes/experiments/guydosh_cell/dom34KO_CHX/data/SRR1042854.fastq' seqs = [r.seq for _, r in zip(xrange(100000), fastq.reads(fastq_fn))] seqs = utilities.progress_bar(len(seqs), seqs) adapter = full_linker count = 0 counts = Counter() for seq in seqs: counts[trim_by_local_alignment(adapter, seq)] += 1
def _format_peak_diagnostic(transcript, genes, landmark, landmark_position, xs, peak_offsets):
    """Build the tab-separated diagnostic table for a set of peak offsets.

    Args:
        transcript: transcript whose ``transcript_to_genomic`` map is used.
        genes: gene dictionaries to report on, primary experiment first;
            only their 'all' entry is read.
        landmark: landmark name the offsets are relative to.
        landmark_position: transcript coordinate of that landmark.
        xs: offsets over which each gene's total is taken.
        peak_offsets: offsets to emit one line for.

    Returns:
        One line per offset, joined with newlines. Each line holds the
        offset and its genomic coordinate, then for every gene the count at
        that offset and its share of the gene's total over ``xs``.
    """
    window = (landmark, xs)
    # Each gene's total does not depend on the peak offset, so compute it
    # once rather than once per (peak, gene) pair.
    # NOTE(review): a total of 0 makes the division below degenerate, as it
    # did before this refactoring.
    totals = [float(gene['all'][window].sum()) for gene in genes]

    lines = []
    for i in peak_offsets:
        genomic = transcript.transcript_to_genomic[landmark_position + i]
        row = ['{0}\t({1:,})\t'.format(i, genomic)]
        for gene, total in zip(genes, totals):
            count = gene['all'][landmark, i]
            row.append('{0}\t{1:0.2%}'.format(count, count / total))
        lines.append('\t'.join(row))
    return '\n'.join(lines)


def call_UTR_boundaries(boundaries_fn, diagnostic_fn='/dev/null'):
    """Call one 5' and one 3' boundary per transcript and write them out.

    For every transcript from the primary 5' experiment's ``get_CDSs()``:

    * the 5' boundary is the offset with the most reads in the primary 5'
      experiment over [-500, CDS_length) relative to 'start_codon', or
      offset 0 when that window is empty;
    * the 3' boundary is the offset with the most reads in the primary 3'
      experiment over [-CDS_length, 500) relative to 'stop_codon', or
      offset 3 when that window is empty.

    Both offsets are converted to genomic coordinates and passed, with the
    transcript's seqname and strand, to ``write_UTR_file``.

    Args:
        boundaries_fn: destination handed to ``write_UTR_file``.
        diagnostic_fn: file that receives, per transcript, a table of the 10
            largest peaks at each end alongside the counts at the same
            offsets in the comparison experiments.

    Returns:
        None.
    """
    build_gene = Serialize.read_positions.build_gene

    experiments = build_all_experiments(verbose=False)

    five_prime_exp = experiments['TL_seq']['arribere_gr']['S288C_TLSeq1']
    three_prime_exp = experiments['three_p_seq']['three_p_seq']['Cerevisiae_3Pseq']
    other_five_prime_exps = [
        experiments['TL_seq']['park_nar']['SMORE-seq_WT_TAP+_rep1'],
        experiments['TIF_seq']['pelechano_nature']['ypd_bio1_lib1'],
    ]
    other_three_prime_exps = [
        experiments['three_t_fill_seq']['wilkening_nar']['3tfill_ypd_rep1'],
        experiments['TIF_seq']['pelechano_nature']['ypd_bio1_lib1'],
    ]

    # Every HDF5 handle is registered here so that all of them are closed in
    # the ``finally`` below, even if processing fails part-way through.
    open_files = []

    def open_read_positions(exp, which):
        fh = h5py.File(exp.file_names[which], 'r')
        open_files.append(fh)
        return fh

    UTR_boundaries = {}
    try:
        five_prime_fh = open_read_positions(five_prime_exp, 'five_prime_read_positions')
        three_prime_fh = open_read_positions(three_prime_exp, 'three_prime_read_positions')
        other_five_prime_fhs = [open_read_positions(exp, 'five_prime_read_positions')
                                for exp in other_five_prime_exps]
        other_three_prime_fhs = [open_read_positions(exp, 'three_prime_read_positions')
                                 for exp in other_three_prime_exps]

        transcripts, _ = five_prime_exp.get_CDSs()

        with open(diagnostic_fn, 'w') as diagnostic_fh:
            progress = utilities.progress_bar(len(transcripts), sorted(transcripts))
            for transcript in progress:
                name = transcript.name
                transcript.build_coordinate_maps(left_buffer=500, right_buffer=500)

                # ---- 5' end ----
                five_prime_gene = build_gene(five_prime_fh[name], specific_keys={'all'})
                other_genes = [build_gene(other_fh[name], specific_keys={'all'})
                               for other_fh in other_five_prime_fhs]

                five_xs = np.arange(-500, transcript.CDS_length)
                five_slice = ('start_codon', five_xs)
                five_counts = five_prime_gene['all']
                five_sum = five_counts[five_slice].sum()

                if five_sum == 0:
                    # No reads at all: fall back to the start codon itself.
                    five_offset = 0
                else:
                    five_offset = five_counts.argmax_over_slice('start_codon', five_xs)

                n_largest = five_counts.n_largest_over_slice(10, five_slice)
                five_prime_diagnostic = _format_peak_diagnostic(
                    transcript,
                    [five_prime_gene] + other_genes,
                    'start_codon',
                    transcript.transcript_start_codon,
                    five_xs,
                    n_largest,
                )

                # ---- 3' end ----
                # The '0' entry is loaded but currently unused; the original
                # had its subtraction from 'all' commented out.
                three_prime_gene = build_gene(three_prime_fh[name], specific_keys={'all', '0'})
                other_genes = [build_gene(other_fh[name], specific_keys={'all', '0'})
                               for other_fh in other_three_prime_fhs]

                three_xs = np.arange(-transcript.CDS_length, 500)
                three_slice = ('stop_codon', three_xs)
                three_counts = three_prime_gene['all']
                three_sum = three_counts[three_slice].sum()

                if three_sum == 0:
                    # No reads at all: fall back to offset 3 from the stop
                    # codon landmark.
                    three_offset = 3
                else:
                    three_offset = three_counts.argmax_over_slice('stop_codon', three_xs)

                n_largest = three_counts.n_largest_over_slice(10, three_slice)
                three_prime_diagnostic = _format_peak_diagnostic(
                    transcript,
                    [three_prime_gene] + other_genes,
                    'stop_codon',
                    transcript.transcript_stop_codon,
                    three_xs,
                    n_largest,
                )

                diagnostic_fh.write('{0}\n'.format(str(transcript)))
                diagnostic_fh.write('{0}\n'.format(five_prime_diagnostic))
                diagnostic_fh.write('\n')
                diagnostic_fh.write('{0}\n'.format(three_prime_diagnostic))
                diagnostic_fh.write('\n')

                five_pos = transcript.transcript_to_genomic[transcript.transcript_start_codon + five_offset]
                three_pos = transcript.transcript_to_genomic[transcript.transcript_stop_codon + three_offset]

                transcript.delete_coordinate_maps()

                UTR_boundaries[name] = (transcript.seqname, transcript.strand, five_pos, three_pos)
    finally:
        for fh in open_files:
            fh.close()

    write_UTR_file(UTR_boundaries, boundaries_fn)
def call_3p_peaks():
    """Record the strongest 3'-end peak per transcript and write a table.

    Each CDS found in a selected experiment's "three_prime_read_positions"
    HDF5 file is examined over offsets 0..399 relative to "stop_codon"; the
    offset holding the maximum of gene["all"] is stored per experiment.

    The hard-coded output file then receives one line per transcript
    recorded for "Cerevisiae_3Pseq", sorted by that experiment's peak
    offset: the string form of the CDS and the peak offset from every
    selected experiment, each followed by a tab.

    Returns:
        None.
    """
    gtf_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf"
    genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"
    output_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_3p_lengths.txt"

    # Unused below; the call is retained in case building the fetcher has
    # effects that matter elsewhere (TODO confirm and remove).
    region_fetcher = genomes.build_region_fetcher(genome_dir)

    CDSs = gtf.get_CDSs(gtf_fn)
    CDS_dict = {t.name: t for t in CDSs}

    experiments = build_all_experiments(verbose=False)
    three_prime_experiments = (
        sorted(experiments["three_p_seq"]["three_p_seq"].items())
        + [
            (n, e)
            for n, e in sorted(experiments["three_t_fill_seq"]["wilkening_nar"].items())
            if "3tfill_ypd_rep1" in n
        ]
        + [
            (n, e)
            for n, e in sorted(experiments["TIF_seq"]["pelechano_nature"].items())
            if n in ("ypd_bio1_lib1", "ypd_bio1_lib4")
        ]
    )

    # The scanned window is the same for every transcript, so build it once.
    xs = np.arange(0, 400)

    argmaxes = {}
    # fractions / joints are accumulated but never read in this function.
    fractions = {}
    joints = {}
    for name, experiment in three_prime_experiments:
        print(name)
        argmaxes[name] = {}
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names["three_prime_read_positions"]
        # ``with`` closes the HDF5 file once this experiment has been
        # processed; it used to stay open indefinitely.
        with h5py.File(fn, "r") as f:
            for transcript in utilities.progress_bar(len(CDSs), CDSs):
                if transcript.name not in f:
                    continue
                gene = Serialize.read_positions.build_gene(f[transcript.name])
                counts = gene["all"]
                argmax = counts.argmax_over_slice("stop_codon", xs)
                argmaxes[name][transcript.name] = argmax
                most = counts["stop_codon", argmax]
                total = counts["stop_codon", xs].sum()
                if total > 9:
                    fraction = np.true_divide(most, total)
                    fractions[name].append(fraction)
                    joints[name].append((argmax, fraction))

    with open(output_fn, "w") as output_fh:
        reference = argmaxes["Cerevisiae_3Pseq"]
        name_order = sorted(reference, key=reference.get)
        for name in name_order:
            output_fh.write("{0}\t".format(str(CDS_dict[name])))
            for exp_name, _ in three_prime_experiments:
                # NOTE(review): raises KeyError if a transcript recorded for
                # "Cerevisiae_3Pseq" was skipped for another experiment.
                output_fh.write("{0}\t".format(argmaxes[exp_name][name]))
            output_fh.write("\n")
composition_fn = ( "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5" ) CDSs = gff.get_CDSs(gff_fn, genome_dir) import select_work exps = select_work.build_all_experiments(verbose=False) reads_fn = exps["belgium_2014_12_10"]["WT_1_mRNA"].file_names["three_prime_read_positions"] reads_fh = h5py.File(reads_fn, "r") meta_counts = positions.PositionCounts({"A": 0}, left_buffer=100000, right_buffer=100000) f = h5py.File(composition_fn, "r") for t in utilities.progress_bar(len(CDSs), CDSs): if t.name not in reads_fh: continue gene = Serialize.read_positions.build_gene(f[t.name]) t.build_coordinate_maps() if t.transcript_length < 301: continue end = t.transcript_length - 200 sl = ("start", np.arange(100, end)) A_rich_position = gene[10].argmax_over_slice(*sl) if gene[10]["start", A_rich_position] > 9: counts = Serialize.read_positions.build_gene(reads_fh[t.name]) before_counts = counts["all"]["start", 0:A_rich_position] after_counts = counts["all"]["start", A_rich_position : A_rich_position + 200] meta_counts["A", -len(before_counts) : 0] += before_counts
composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5' CDSs = gff.get_CDSs(gff_fn, genome_dir) import select_work exps = select_work.build_all_experiments(verbose=False) reads_fn = exps['belgium_2014_12_10']['WT_1_mRNA'].file_names[ 'three_prime_read_positions'] reads_fh = h5py.File(reads_fn, 'r') meta_counts = positions.PositionCounts({'A': 0}, left_buffer=100000, right_buffer=100000) f = h5py.File(composition_fn, 'r') for t in utilities.progress_bar(len(CDSs), CDSs): if t.name not in reads_fh: continue gene = Serialize.read_positions.build_gene(f[t.name]) t.build_coordinate_maps() if t.transcript_length < 301: continue end = t.transcript_length - 200 sl = ('start', np.arange(100, end)) A_rich_position = gene[10].argmax_over_slice(*sl) if gene[10]['start', A_rich_position] > 9: counts = Serialize.read_positions.build_gene(reads_fh[t.name]) before_counts = counts['all']['start', 0:A_rich_position] after_counts = counts['all']['start', A_rich_position:A_rich_position + 200]
# Remove blocks from the beginning. trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(mapping.cigar, trimmed_length) updated_cigar = soft_clipped_block + trimmed_cigar else: # Remove blocks from the end. trimmed_cigar = sam.truncate_cigar_blocks_up_to(mapping.cigar, trimmed_length) updated_cigar = trimmed_cigar + soft_clipped_block mapping.cigar = updated_cigar if mapping.tags: # Clear the MD tag since the possible removal of bases to the # alignment may have made it inaccurate. # TODO: now have machinery to make it accurate. filtered_tags = filter(lambda t: t[0] != 'MD', mapping.tags) mapping.tags = filtered_tags set_nongenomic_length(mapping, bases_to_trim) return mapping if __name__ == '__main__': fastq_fn = '/home/jah/projects/ribosomes/experiments/guydosh_cell/dom34KO_CHX/data/SRR1042854.fastq' seqs = [r.seq for _, r in zip(xrange(100000), fastq.reads(fastq_fn))] seqs = utilities.progress_bar(len(seqs), seqs) adapter = full_linker count = 0 counts = Counter() for seq in seqs: counts[trim_by_local_alignment(adapter, seq)] += 1