def get_CDSs(self, force_all=False):
    """Load the CDSs for this experiment and find the longest transcript.

    Every CDS is read with ``gff.get_CDSs`` from ``self.file_names['genes']``
    and ``self.file_names['genome']``. When ``self.transcripts_file_name`` is
    not None, only CDSs whose ``name`` appears in that file (one name per
    line, surrounding whitespace stripped) are kept.

    Args:
        force_all: if true, return every selected CDS; otherwise return only
            the result of ``piece_of_list(CDSs, self.num_pieces,
            self.which_piece)``.

    Returns:
        A ``(piece_CDSs, max_gene_length)`` tuple. ``max_gene_length`` is the
        largest ``transcript_length`` over all selected CDSs (not only the
        returned piece), or 0 when no CDS was selected.
    """
    all_CDSs = gff.get_CDSs(
        self.file_names['genes'],
        self.file_names['genome'],
    )

    if self.transcripts_file_name is None:
        CDSs = all_CDSs
    else:
        # Read the whitelist inside a context manager so the file handle is
        # closed deterministically instead of being left to the collector.
        with open(self.transcripts_file_name) as transcripts_fh:
            transcripts = {line.strip() for line in transcripts_fh}
        CDSs = [t for t in all_CDSs if t.name in transcripts]

    max_gene_length = 0
    for CDS in CDSs:
        # The coordinate maps are built just before transcript_length is read
        # and deleted right after (presumably to bound memory -- verify).
        CDS.build_coordinate_maps()
        max_gene_length = max(max_gene_length, CDS.transcript_length)
        CDS.delete_coordinate_maps()

    if force_all:
        piece_CDSs = CDSs
    else:
        piece_CDSs = piece_of_list(CDSs, self.num_pieces, self.which_piece)

    return piece_CDSs, max_gene_length
def produce_transcript_base_compositions(
    gff_fn=None,
    genome_dir=None,
    composition_fn=None,
    left_buffer=500,
    right_buffer=500,
    windows=(5, 10, 20),
):
    """Count As in sliding windows along every transcript and write them out.

    For each CDS returned by ``gff.get_CDSs(gff_fn, genome_dir)`` and each
    window size, a ``positions.PositionCounts`` is filled so that the entry at
    ``('start', left_edge)`` holds the number of positions equal to ``'A'`` in
    ``[left_edge, left_edge + window)`` of the buffered transcript sequence.
    The resulting ``{transcript name: {window: PositionCounts}}`` dictionary
    is passed to ``Serialize.read_positions.write_file``.

    Called with no arguments, this uses the same paths, buffers and windows
    that were previously hard-coded.

    Args:
        gff_fn: path of the genes GFF file; None selects the EF4 default.
        genome_dir: path of the genome directory; None selects the EF4 default.
        composition_fn: path of the HDF5 output file; None selects the EF4
            default.
        left_buffer: number of positions included before the 'start' landmark.
        right_buffer: number of positions included after the end of the
            transcript.
        windows: iterable of window sizes to tabulate.
    """
    if gff_fn is None:
        gff_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff'
    if genome_dir is None:
        genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    if composition_fn is None:
        composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5'

    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    genes = {}

    for transcript in utilities.progress_bar(len(CDSs), CDSs):
        genes[transcript.name] = {}

        # The transcript_* attributes read below are accessed only between
        # build_coordinate_maps() and delete_coordinate_maps().
        transcript.build_coordinate_maps()

        landmarks = {
            'start': 0,
            'start_codon': transcript.transcript_start_codon,
            'stop_codon': transcript.transcript_stop_codon,
            'end': transcript.transcript_length,
        }

        sequence = transcript.get_transcript_sequence(left_buffer, right_buffer)
        # Boolean indicator of which sequence positions are 'A'.
        A_locations = positions.PositionCounts(
            landmarks,
            left_buffer,
            right_buffer,
            data=(sequence.data == 'A'),
        )

        for window in windows:
            recent_As = positions.PositionCounts(
                landmarks,
                left_buffer,
                right_buffer,
            )

            # NOTE(review): the upper bound uses CDS_length while the 'end'
            # landmark uses transcript_length -- confirm this is intended.
            for left_edge in range(
                    -left_buffer,
                    transcript.CDS_length + right_buffer - window):
                num_As = sum(A_locations['start', left_edge:left_edge + window])
                recent_As['start', left_edge] = num_As

            genes[transcript.name][window] = recent_As

        transcript.delete_coordinate_maps()

    Serialize.read_positions.write_file(genes, composition_fn)
def get_CDSs(self, force_all=False):
    """Return this experiment's CDSs together with the longest transcript length.

    All CDSs come from ``gff.get_CDSs`` applied to ``self.file_names["genes"]``
    and ``self.file_names["genome"]``. If ``self.transcripts_file_name`` is
    not None, the list is restricted to CDSs whose ``name`` is listed in that
    file (one name per line, compared after stripping whitespace).

    Args:
        force_all: when true, every selected CDS is returned; when false, only
            ``piece_of_list(CDSs, self.num_pieces, self.which_piece)`` is.

    Returns:
        A ``(piece_CDSs, max_gene_length)`` tuple, where ``max_gene_length``
        is the maximum ``transcript_length`` across all selected CDSs
        (regardless of which piece is returned), or 0 if there are none.
    """
    all_CDSs = gff.get_CDSs(self.file_names["genes"], self.file_names["genome"])

    if self.transcripts_file_name is None:
        CDSs = all_CDSs
    else:
        # Use a context manager so the names file is closed as soon as it has
        # been read, rather than leaking the open handle.
        with open(self.transcripts_file_name) as names_fh:
            transcripts = {line.strip() for line in names_fh}
        CDSs = [t for t in all_CDSs if t.name in transcripts]

    max_gene_length = 0
    for CDS in CDSs:
        # transcript_length is read between building and deleting the
        # coordinate maps (presumably it depends on them -- verify).
        CDS.build_coordinate_maps()
        max_gene_length = max(max_gene_length, CDS.transcript_length)
        CDS.delete_coordinate_maps()

    if force_all:
        piece_CDSs = CDSs
    else:
        piece_CDSs = piece_of_list(CDSs, self.num_pieces, self.which_piece)

    return piece_CDSs, max_gene_length
def produce_transcript_base_compositions(
    gff_fn=None,
    genome_dir=None,
    composition_fn=None,
    left_buffer=500,
    right_buffer=500,
    windows=(5, 10, 20),
):
    """Tabulate windowed A counts for every transcript and serialize them.

    For each CDS from ``gff.get_CDSs(gff_fn, genome_dir)`` and each size in
    ``windows``, builds a ``positions.PositionCounts`` whose entry at
    ``("start", left_edge)`` is the number of ``"A"`` positions in
    ``[left_edge, left_edge + window)`` of the buffered transcript sequence.
    The nested ``{transcript name: {window: PositionCounts}}`` mapping is then
    handed to ``Serialize.read_positions.write_file``.

    With no arguments the function uses the paths, buffer sizes and window
    sizes that used to be hard-coded, so existing callers are unaffected.

    Args:
        gff_fn: genes GFF path; None falls back to the EF4 default.
        genome_dir: genome directory path; None falls back to the EF4 default.
        composition_fn: HDF5 output path; None falls back to the EF4 default.
        left_buffer: number of positions included before the "start" landmark.
        right_buffer: number of positions included past the transcript end.
        windows: iterable of window sizes to compute.
    """
    if gff_fn is None:
        gff_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff"
    if genome_dir is None:
        genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"
    if composition_fn is None:
        composition_fn = (
            "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5"
        )

    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    genes = {}

    for transcript in utilities.progress_bar(len(CDSs), CDSs):
        genes[transcript.name] = {}

        # Coordinate maps are built before the transcript_* attributes are
        # read and deleted again once this transcript is finished.
        transcript.build_coordinate_maps()

        landmarks = {
            "start": 0,
            "start_codon": transcript.transcript_start_codon,
            "stop_codon": transcript.transcript_stop_codon,
            "end": transcript.transcript_length,
        }

        sequence = transcript.get_transcript_sequence(left_buffer, right_buffer)
        # Boolean mask marking the positions whose base is "A".
        A_locations = positions.PositionCounts(landmarks, left_buffer, right_buffer, data=(sequence.data == "A"))

        for window in windows:
            recent_As = positions.PositionCounts(landmarks, left_buffer, right_buffer)

            # NOTE(review): the loop stops at CDS_length + right_buffer - window,
            # whereas the "end" landmark is transcript_length -- confirm intent.
            for left_edge in range(-left_buffer, transcript.CDS_length + right_buffer - window):
                num_As = sum(A_locations["start", left_edge : left_edge + window])
                recent_As["start", left_edge] = num_As

            genes[transcript.name][window] = recent_As

        transcript.delete_coordinate_maps()

    Serialize.read_positions.write_file(genes, composition_fn)
total = gene["all"]["start_codon", xs].sum() if total == 0: print transcript if total > 9: fraction = np.true_divide(most, total) fractions[name].append(fraction) joints[name].append((argmax, fraction)) if __name__ == "__main__": gff_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff" genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/" composition_fn = ( "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5" ) CDSs = gff.get_CDSs(gff_fn, genome_dir) import select_work exps = select_work.build_all_experiments(verbose=False) reads_fn = exps["belgium_2014_12_10"]["WT_1_mRNA"].file_names["three_prime_read_positions"] reads_fh = h5py.File(reads_fn, "r") meta_counts = positions.PositionCounts({"A": 0}, left_buffer=100000, right_buffer=100000) f = h5py.File(composition_fn, "r") for t in utilities.progress_bar(len(CDSs), CDSs): if t.name not in reads_fh: continue gene = Serialize.read_positions.build_gene(f[t.name])
argmaxes[name][argmax] += 1 most = gene['all']['start_codon', argmax] total = gene['all']['start_codon', xs].sum() if total == 0: print transcript if total > 9: fraction = np.true_divide(most, total) fractions[name].append(fraction) joints[name].append((argmax, fraction)) if __name__ == '__main__': gff_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff' genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/' composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5' CDSs = gff.get_CDSs(gff_fn, genome_dir) import select_work exps = select_work.build_all_experiments(verbose=False) reads_fn = exps['belgium_2014_12_10']['WT_1_mRNA'].file_names[ 'three_prime_read_positions'] reads_fh = h5py.File(reads_fn, 'r') meta_counts = positions.PositionCounts({'A': 0}, left_buffer=100000, right_buffer=100000) f = h5py.File(composition_fn, 'r') for t in utilities.progress_bar(len(CDSs), CDSs): if t.name not in reads_fh: