def compute_metanucleotide_counts(read_positions):
    """Accumulate per-length read counts in a fixed window around every codon.

    For each codon identity, sums the read counts observed in a
    [-left_buffer, right_buffer) nucleotide window centered on each
    occurrence of that codon across all transcripts.

    Parameters
    ----------
    read_positions : dict
        Maps transcript name -> {length: PositionCounts, 'sequence':
        PositionCounts-of-characters}. (Presumably produced elsewhere in
        this module — schema inferred from usage here; confirm.)

    Returns
    -------
    dict
        codon_id -> {length: PositionCounts over a single 'feature'
        landmark at 0, with left_buffer/right_buffer padding}.
    """
    left_buffer = 50
    right_buffer = 50

    # Figure out what lengths were recorded.
    length_keys, _, _ = extract_lengths_and_buffers(read_positions)

    # Currently unused; kept for the optional single/di-nucleotide
    # aggregation commented out below.
    dinucleotides = [''.join(pair) for pair in product('TCAG', repeat=2)]
    features_keys = codons.all_codons  # + ['T', 'C', 'A', 'G'] + dinucleotides

    metacodon_counts = {
        features_key: {
            length: PositionCounts({'feature': 0}, left_buffer, right_buffer)
            for length in length_keys
        }
        for features_key in features_keys
    }
    feature_slice = ('feature', slice(-left_buffer, right_buffer))

    # .items() instead of py2-only .iteritems() — same iteration, and the
    # function then also runs under Python 3.
    for name, read_counts in read_positions.items():
        transcript_sequence = read_counts['sequence']
        coding_sequence = ''.join(
            transcript_sequence['start_codon':('stop_codon', 3)])
        for c, codon_id in enumerate(codons.codons_from_seq(coding_sequence)):
            p = 3 * c
            # Only codons whose window fits entirely inside the CDS.
            if left_buffer <= p <= len(coding_sequence) - right_buffer:
                # Window must have the same extent as feature_slice, so the
                # right edge uses right_buffer (the original used left_buffer,
                # which was only correct because the two buffers are equal).
                p_slice = ('start_codon',
                           slice(p - left_buffer, p + right_buffer))
                for length in length_keys:
                    counts = read_counts[length][p_slice]
                    metacodon_counts[codon_id][length][feature_slice] += counts
                    #nucleotide = codon_id[0]
                    #metacodon_counts[nucleotide][length][feature_slice] += counts
                    #dinucleotide = codon_id[:2]
                    #metacodon_counts[dinucleotide][length][feature_slice] += counts

    return metacodon_counts
def compute_codon_counts(position_counts, offset_type):
    """Collapse nucleotide-resolution read counts to codon resolution.

    Uses the A-site offset for each read length (looked up in the
    module-level ``A_site_offsets[offset_type]``) to assign reads to
    codons, summing the in-frame position together with the positions one
    nucleotide behind and ahead of it.

    Parameters
    ----------
    position_counts : dict
        Maps read length -> PositionCounts, plus a 'sequence' entry holding
        the transcript sequence as a PositionCounts of characters.
    offset_type : hashable
        Key into the module-level ``A_site_offsets`` table.

    Returns
    -------
    (PositionCounts, PositionCounts)
        Codon-resolution read counts, and the codon identity at each codon
        position (string array), both padded by the module-level
        ``codon_buffer`` codons on each side.

    Raises
    ------
    ValueError
        If the CDS length is not a multiple of 3.
    """
    # Any value will do — all entries describe the same transcript, so they
    # share CDS_length.  next(iter(...)) works on both py2 and py3, unlike
    # the py2-only ``.values()[0]``.
    CDS_length = next(iter(position_counts.values())).CDS_length
    if CDS_length % 3 != 0:
        raise ValueError('CDS length not divisible by 3')

    # Note: CDS_length is the index of the first nucleotide of the stop codon.
    # Ingolia's original model never has the stop codon in the A site, but
    # subsequent data show an accumulation of (typically length 29 or 30) reads
    # that do advance this far.
    num_codons = CDS_length // 3
    landmarks = {
        'start_codon': 0,
        'stop_codon': num_codons,
    }
    codon_counts = PositionCounts(landmarks, codon_buffer, codon_buffer)

    # Only lengths with a known A-site offset contribute; extra keys in
    # position_counts (e.g. 'sequence') fall out of the intersection.
    recorded_lengths = set(position_counts.keys())
    known_A_site_lengths = set(A_site_offsets[offset_type].keys())

    for length in recorded_lengths & known_A_site_lengths:
        A_site_offset = A_site_offsets[offset_type][length]
        start_index = -A_site_offset - (codon_buffer * 3)
        end_index = CDS_length - A_site_offset + (codon_buffer * 3)
        # Stride-3 slices pick one nucleotide per codon; the +/-1 shifted
        # slices fold the two out-of-frame positions into the same codon.
        in_frame = slice(start_index, end_index, 3)
        one_behind = slice(start_index - 1, end_index - 1, 3)
        one_ahead = slice(start_index + 1, end_index + 1, 3)
        codon_counts.data += position_counts[length]['start_codon', in_frame] + \
                             position_counts[length]['start_codon', one_behind] + \
                             position_counts[length]['start_codon', one_ahead]

    # Record the codon identity spanning the same buffered region.
    sequence_slice = slice(('start_codon', -codon_buffer * 3),
                           ('stop_codon', codon_buffer * 3))
    sequence = ''.join(position_counts['sequence'][sequence_slice])
    codon_identities_list = list(codons.codons_from_seq(sequence))
    codon_identities = PositionCounts(
        landmarks,
        codon_buffer,
        codon_buffer,
        data=np.asarray(codon_identities_list),
    )

    return codon_counts, codon_identities
def compute_metanucleotide_counts(read_positions):
    """Build meta-codon profiles: per-length read counts summed over a
    100-nt window around every in-range occurrence of each codon.

    read_positions maps transcript name -> {length: PositionCounts,
    'sequence': PositionCounts of characters} (schema inferred from
    usage; confirm against the caller).  Returns a dict keyed by codon
    identity, each value mapping read length to a PositionCounts over a
    single 'feature' landmark.
    """
    left_buffer = 50
    right_buffer = 50

    # Determine which fragment lengths were recorded.
    length_keys, _, _ = extract_lengths_and_buffers(read_positions)

    # Unused at present — would extend the profile keys to single
    # nucleotides and dinucleotides.
    dinucleotides = [first + second
                     for first, second in product('TCAG', repeat=2)]
    features_keys = codons.all_codons  # + ['T', 'C', 'A', 'G'] + dinucleotides

    metacodon_counts = {}
    for key in features_keys:
        per_length = {}
        for length in length_keys:
            per_length[length] = PositionCounts(
                {'feature': 0}, left_buffer, right_buffer)
        metacodon_counts[key] = per_length

    feature_slice = ('feature', slice(-left_buffer, right_buffer))

    for name, read_counts in read_positions.iteritems():
        coding_sequence = ''.join(
            read_counts['sequence']['start_codon':('stop_codon', 3)])
        last_usable = len(coding_sequence) - right_buffer
        for index, codon_id in enumerate(
                codons.codons_from_seq(coding_sequence)):
            position = 3 * index
            # Skip codons too close to either end for a full window.
            if position < left_buffer or position > last_usable:
                continue
            window = ('start_codon',
                      slice(position - left_buffer, position + left_buffer))
            for length in length_keys:
                metacodon_counts[codon_id][length][feature_slice] += \
                    read_counts[length][window]

    return metacodon_counts
def compute_codon_counts(position_counts, offset_type):
    """Reduce nucleotide-level read counts to one count per codon.

    Each read length with a known A-site offset (module-level
    ``A_site_offsets[offset_type]``) contributes its in-frame position
    plus the two flanking out-of-frame positions to the owning codon.
    Returns (codon_counts, codon_identities), both PositionCounts padded
    by the module-level ``codon_buffer`` codons on each side.  Raises
    ValueError when the CDS length is not a multiple of 3.
    """
    # All entries describe the same transcript, so any value supplies the
    # shared CDS_length.
    CDS_length = position_counts.values()[0].CDS_length
    if CDS_length % 3 != 0:
        raise ValueError('CDS length not divisible by 3')

    # CDS_length is the index of the stop codon's first nucleotide.
    # Ingolia's original model never places the stop codon in the A site,
    # but later data show an accumulation of (typically length 29 or 30)
    # reads that do advance that far.
    num_codons = CDS_length // 3
    landmarks = {'start_codon': 0, 'stop_codon': num_codons}
    codon_counts = PositionCounts(landmarks, codon_buffer, codon_buffer)

    # Extra keys in position_counts (e.g. 'sequence') drop out of the
    # intersection with the lengths that have a known offset.
    usable_lengths = set(position_counts) & set(A_site_offsets[offset_type])

    for length in usable_lengths:
        offset = A_site_offsets[offset_type][length]
        first = -offset - 3 * codon_buffer
        last = CDS_length - offset + 3 * codon_buffer
        # shift -1/0/+1 covers one-behind, in-frame, and one-ahead
        # nucleotides; stride 3 picks one position per codon.
        for shift in (-1, 0, 1):
            frame = slice(first + shift, last + shift, 3)
            codon_counts.data += position_counts[length]['start_codon', frame]

    # Codon identities over the same buffered span.
    identity_slice = slice(('start_codon', -codon_buffer * 3),
                           ('stop_codon', codon_buffer * 3))
    buffered_sequence = ''.join(position_counts['sequence'][identity_slice])
    identity_array = np.asarray(
        list(codons.codons_from_seq(buffered_sequence)))
    codon_identities = PositionCounts(landmarks, codon_buffer, codon_buffer,
                                      data=identity_array)

    return codon_counts, codon_identities