def __init__(self, r_rna, temp=37.0): self.__r_rna = r_rna.upper() self.__runner = NuPackRunner(temp) self.__optimal_spacing = 5 self.__cutoff = 35
class RbsCalculator(object): '''Class for calculating RBS.''' def __init__(self, r_rna, temp=37.0): self.__r_rna = r_rna.upper() self.__runner = NuPackRunner(temp) self.__optimal_spacing = 5 self.__cutoff = 35 def calc_dgs(self, m_rna, limit=float('inf')): ''''Calculates each dg term in the free energy model and sums them to create dg_total.''' m_rna = m_rna.upper() start_positions = [] dgs = [] count = 0 for match in re.finditer(seq_utils.START_CODON_PATT, m_rna, overlapped=True): start_pos = match.start() d_g = self.__calc_dg(m_rna, start_pos) start_positions.append(start_pos) dgs.append(d_g) count += 1 if count == limit: break return start_positions, dgs def calc_kinetic_score(self, m_rna, start_pos, dangles='none'): '''Gets kinetic score.''' sub_m_rna = \ m_rna[max(0, start_pos - self.__cutoff):min(len(m_rna), start_pos + self.__cutoff)] _, bp_xs, bp_ys = self.__runner.mfe([sub_m_rna], dangles=dangles) largest_range_helix = 0 for (nt_x, nt_y) in zip(bp_xs[0], bp_ys[0]): if nt_x <= len(sub_m_rna) and nt_y <= len(sub_m_rna): val = nt_y - nt_x largest_range_helix = max(val, largest_range_helix) return float(largest_range_helix) / float(len(sub_m_rna)) def get_initial_rbs(self, cds, dg_target_rel): '''Generates random initial condition for designing a synthetic rbs sequence.''' cds = cds.upper() dg_range_high = 25.0 dg_range_low = -18.0 dg_target_rel = (dg_target_rel - dg_range_high) / \ (dg_range_low - dg_range_high) # 0.0: Low expression # 1.0: High expression if dg_target_rel < 0.125: prob_shine_delgano = 0.50 core_length = 4 max_nonoptimal_spacing = 10 elif dg_target_rel < 0.250: prob_shine_delgano = 0.50 core_length = 4 max_nonoptimal_spacing = 10 elif dg_target_rel < 0.5: prob_shine_delgano = 0.75 core_length = 4 max_nonoptimal_spacing = 10 elif dg_target_rel < 0.7: prob_shine_delgano = 0.75 core_length = 4 max_nonoptimal_spacing = 5 elif dg_target_rel < 0.8: prob_shine_delgano = 0.75 core_length = 6 max_nonoptimal_spacing = 5 elif dg_target_rel < 0.9: prob_shine_delgano = 0.90 core_length = 6 max_nonoptimal_spacing = 5 elif dg_target_rel < 0.95: prob_shine_delgano = 0.90 core_length = 8 max_nonoptimal_spacing = 3 else: prob_shine_delgano = 1.0 core_length = 9 max_nonoptimal_spacing = 2 shine_delgano = Seq(self.__r_rna).reverse_complement() return self.__get_random_rbs(shine_delgano, prob_shine_delgano, core_length, max_nonoptimal_spacing) def __calc_dg(self, m_rna, start_pos): '''Calculates dG.''' # Set dangles based on length between 5' end of m_rna and start codon: if start_pos > MAX_RBS_LENGTH: dangles = 'none' else: dangles = 'all' # Start codon energy: start_codon_energies = {'ATG': -1.194, 'GTG': -0.0748, 'TTG': -0.0435, 'CTG': -0.03406} dg_start = start_codon_energies[m_rna[start_pos:start_pos + 3]] # Energy of m_rna folding: [dg_m_rna, _, _] = \ self.__calc_dg_m_rna(m_rna, start_pos, dangles) # Energy of m_rna:r_rna hybridization and folding: [dg_m_rna_r_rna, m_rna_subseq, bp_x, bp_y, energy_before] = \ self.__calc_dg_m_rna_r_rna(m_rna, start_pos, dangles) # Standby site correction: dg_standby = self.__calc_dg_standby_site(m_rna_subseq, bp_x, bp_y, energy_before, dangles) # Total energy is m_rna:r_rna + start - r_rna - m_rna - standby_site: return dg_m_rna_r_rna + dg_start - dg_m_rna - dg_standby def __calc_dg_m_rna(self, m_rna, start_pos, dangles='all'): '''Calculates the dg_m_rna given the m_rna sequence.''' m_rna_subseq = \ m_rna[max(0, start_pos - self.__cutoff):min(len(m_rna), start_pos + self.__cutoff)] energies, bp_xs, bp_ys = self.__runner.mfe([m_rna_subseq], dangles=dangles) return energies[0], bp_xs[0], bp_ys[0] def __calc_dg_m_rna_r_rna(self, m_rna, start_pos, dangles): '''Calculates the dg_m_rna_r_rna from the m_rna and r_rna sequence. Considers all feasible 16S r_rna binding sites and includes the effects of non-optimal spacing.''' energy_cutoff = 3.0 # Footprint of the 30S complex that prevents formation of secondary # structures downstream of the start codon. Here, we assume that the # entire post-start RNA sequence does not form secondary structures # once the 30S complex has bound. footprint = 1000 begin = max(0, start_pos - self.__cutoff) m_rna_len = min(len(m_rna), start_pos + self.__cutoff) start_pos_in_subsequence = min(start_pos, self.__cutoff) startpos_to_end_len = m_rna_len - start_pos_in_subsequence - begin # 1. identify a list of r_rna-binding sites. Binding sites are # hybridizations between the m_rna and r_rna and can include # mismatches, bulges, etc. Intra-molecular folding is also allowed # within the m_rna. # The subopt program is used to generate a list of optimal & suboptimal # binding sites. # Constraints: the entire r_rna-binding site must be upstream of the # start codon m_rna_subseq = m_rna[begin:start_pos] if len(m_rna_subseq) == 0: raise ValueError('Warning: There is a leaderless start codon, ' + 'which is being ignored.') # print 'After exception' energies, bp_xs, bp_ys = self.__runner.subopt([m_rna_subseq, self.__r_rna], energy_cutoff, dangles=dangles) if len(bp_xs) == 0: raise ValueError( 'Warning: The 16S r_rna has no predicted binding site. ' + 'Start codon is considered as leaderless and ignored.') # 2. Calculate dg_spacing for each 16S r_rna binding site # Calculate the aligned spacing for each binding site in the list aligned_spacing = [] for (bp_x, bp_y) in zip(bp_xs, bp_ys): aligned_spacing.append( self.__calc_aligned_spacing(m_rna_subseq, start_pos_in_subsequence, bp_x, bp_y)) dg_spacing_list = [] dg_m_rna_r_rna = [] dg_m_rna_r_rna_spacing = [] # Calculate dg_spacing using aligned spacing value. Add it to # dg_m_rna_r_rna. for counter in range(len(bp_xs)): dg_m_rna_r_rna.append(energies[counter]) val = self.__calc_dg_spacing(aligned_spacing[counter]) dg_spacing_list.append(val) dg_m_rna_r_rna_spacing.append( val + energies[counter]) # 3. Find 16S r_rna binding site that minimizes # dg_spacing+dg_m_rna_r_rna. index = dg_m_rna_r_rna_spacing.index(min(dg_m_rna_r_rna_spacing)) dg_spacing_final = dg_spacing_list[index] # Check: Is the dg spacing large compared to the energy gap? If so, # this means the list of suboptimal 16S r_rna binding sites generated # by subopt is too short. # if dg_spacing_final > energy_cutoff: # print 'Warning: The spacing penalty is greater than the ' + \ # 'energy gap. dg (spacing) = ', dg_spacing_final # 4. Identify the 5' and 3' ends of the identified 16S r_rna binding # site. Create a base pair list. most_5p_m_rna = float('inf') most_3p_m_rna = -float('inf') # Generate a list of r_rna-m_rna base paired nucleotides bp_x_target = [] bp_y_target = [] bp_x = bp_xs[index] bp_y = bp_ys[index] for (nt_x, nt_y) in zip(bp_x, bp_y): if nt_y > len(m_rna_subseq): # nt is r_rna most_5p_m_rna = min(most_5p_m_rna, bp_x[bp_y.index(nt_y)]) most_3p_m_rna = max(most_3p_m_rna, bp_x[bp_y.index(nt_y)]) bp_x_target.append(nt_x) bp_y_target.append(nt_y) # The r_rna-binding site is between the nucleotides at positions # most_5p_m_rna and most_3p_m_rna # Now, fold the pre-sequence, r_rna-binding-sequence and post-sequence # separately. Take their base pairings and combine them together. # Calculate the total energy. For secondary structures, this splitting # operation is allowed. # We postulate that not all of the post-sequence can form secondary # structures. Once the 30S complex binds to the m_rna, it prevents the # formation of secondary structures that are mutually exclusive with # ribosome binding. We define self.footprint to be the length of the # 30S complex footprint. Here, we assume that the entire m_rna sequence # downstream of the 16S r_rna binding site can not form secondary # structures. m_rna_pre = m_rna[begin:begin + most_5p_m_rna - 1] post_window_end = m_rna_len + 1 post_window_begin = min( start_pos + footprint, post_window_end) # Footprint post_window_end = m_rna_len + 1 m_rna_post = m_rna[post_window_begin:post_window_end] total_bp_x = [] total_bp_y = [] # Calculate pre-sequence folding if len(m_rna_pre) > 0: _, bp_xs, bp_ys = self.__runner.mfe([m_rna_pre], dangles=dangles) bp_x_pre = bp_xs[0] bp_y_pre = bp_ys[0] else: bp_x_pre = [] bp_y_pre = [] # Add pre-sequence base pairings to total base pairings offset = 0 # Begins at 0 for (nt_x, nt_y) in zip(bp_x_pre, bp_y_pre): total_bp_x.append(nt_x + offset) total_bp_y.append(nt_y + offset) # Add r_rna-binding site base pairings to total base pairings offset = 0 # Begins at zero if startpos_to_end_len < self.__cutoff: r_rna_offset = startpos_to_end_len else: r_rna_offset = startpos_to_end_len for (nt_x, nt_y) in zip(bp_x_target, bp_y_target): total_bp_x.append(nt_x + offset) total_bp_y.append(nt_y + r_rna_offset) # Calculate post-sequence folding if len(m_rna_post) > 0: _, bp_xs, bp_ys = self.__runner.mfe([m_rna_post], dangles=dangles) bp_x_post = bp_xs[0] bp_y_post = bp_ys[0] else: bp_x_post = [] bp_y_post = [] offset = post_window_begin - begin for (nt_x, nt_y) in zip(bp_x_post, bp_y_post): total_bp_x.append(nt_x + offset) total_bp_y.append(nt_y + offset) m_rna_subseq = m_rna[begin:m_rna_len] total_energy = self.__runner.energy([m_rna_subseq, self.__r_rna], total_bp_x, total_bp_y, dangles=dangles) total_energy_withspacing = total_energy + dg_spacing_final return (total_energy_withspacing, m_rna_subseq, total_bp_x, total_bp_y, total_energy) def __calc_dg_spacing(self, aligned_spacing): '''Calculates the dG_spacing according to the value of the aligned spacing. This relationship was determined through experiments.''' d_s = aligned_spacing - self.__optimal_spacing if aligned_spacing < self.__optimal_spacing: dg_spacing_penalty = 12.2 / \ (1.0 + math.exp(2.5 * (d_s + 2.0))) ** 3.0 else: dg_spacing_penalty = 0.048 * d_s * d_s + 0.24 * d_s return dg_spacing_penalty def __calc_dg_standby_site(self, m_rna, bp_x, bp_y, energy_before, dangles): '''Calculates the dg of standby given the structure of the m_rna:r_rna complex.''' # To calculate the mfe structure while disallowing base pairing at the # standby site, we split the folded m_rna sequence into three parts: # (i) a pre-sequence (before the standby site) that can fold; (ii) the # standby site, which can not fold; (iii) the 16S r_rna binding site # and downstream sequence, which has been previously folded. standby_site_length = 4 # Identify the most 5p m_rna nt that is bound to r_rna for (nt_x, nt_y) in zip(bp_x, bp_y): # nt_x is m_rna, nt_y is r_rna, they are bound. if nt_x <= len(m_rna) and nt_y > len(m_rna): most_5p_m_rna = nt_x # starts counting from 0 break # Extract the base pairings that are 3' of the most_5p_m_rna base # pairing bp_x_3p = [] bp_y_3p = [] for (nt_x, nt_y) in zip(bp_x, bp_y): if nt_x >= most_5p_m_rna: bp_x_3p.append(nt_x) bp_y_3p.append(nt_y) # Create the m_rna subsequence m_rna_subsequence = m_rna[ 0:max(0, most_5p_m_rna - standby_site_length - 1)] # Fold it and extract the base pairings if len(m_rna_subsequence) > 0: _, bp_xs, bp_ys = self.__runner.mfe( [m_rna_subsequence], dangles=dangles) bp_x_5p = bp_xs[0] # [0] added 12/13/07 bp_y_5p = bp_ys[0] else: bp_x_5p = [] bp_y_5p = [] # Put the sets of base pairings together bp_x_after = [] bp_y_after = [] for (nt_x, nt_y) in zip(bp_x_5p, bp_y_5p): bp_x_after.append(nt_x) bp_y_after.append(nt_y) for (nt_x, nt_y) in zip(bp_x_3p, bp_y_3p): bp_x_after.append(nt_x) bp_y_after.append(nt_y) # Calculate its energy energy_after = self.__runner.energy([m_rna, self.__r_rna], bp_x_after, bp_y_after, dangles=dangles) d_g = energy_before - energy_after if d_g > 0.0: d_g = 0.0 return d_g def __get_random_rbs(self, shine_delgano, prob_shine_delgano, core_length, max_nonoptimal_spacing): '''Generates a random rbs sequence tailored towards the target translation initiation rate.''' pre_length = 25 rbs = [random.choice(seq_utils.NUCLEOTIDES) for _ in range(pre_length)] # Choose core_length nucleotides. # Choose from the SD sequence with probability prob_shine_delgano # Choose from non-SD sequence with probability # (1 - prob_shine_delgano) / 3 # The beginning/end of the core_length wrt to the SD sequence is # uniformly randomly determined. # core_length can't be greater then shine_delgano length: core_length = min(len(shine_delgano), core_length) diff = len(shine_delgano) - core_length begin = int(random.random() * diff) for i in range(core_length): if random.random() < prob_shine_delgano: rbs.append(shine_delgano[begin + i]) else: choices = list(seq_utils.NUCLEOTIDES) choices.remove(shine_delgano[begin + i]) rbs.append(random.choice(choices)) offset = diff - begin spacing = random.choice(range(max( 0, offset + self.__optimal_spacing - max_nonoptimal_spacing), offset + self.__optimal_spacing + max_nonoptimal_spacing)) rbs.extend([random.choice(seq_utils.NUCLEOTIDES) for _ in range(spacing)]) if len(rbs) > MAX_RBS_LENGTH: rbs = rbs[len(rbs) - MAX_RBS_LENGTH:len(rbs) + 1] return ''.join(rbs) def __calc_aligned_spacing(self, m_rna, start_pos, bp_x, bp_y): '''Calculates the aligned spacing between the 16S r_rna binding site and the start codon.''' # r_rna is the concatenated at the end of the sequence in 5' to 3' # direction first: identify the farthest 3' nt in the r_rna that binds # to the mRNA and return its mRNA base pairer seq_len = len(m_rna) + len(self.__r_rna) for r_rna_nt in range(seq_len, seq_len - len(self.__r_rna), -1): if r_rna_nt in bp_y: r_rna_pos = bp_y.index(r_rna_nt) if bp_x[r_rna_pos] < start_pos: farthest_3_prime_r_rna = r_rna_nt - len(m_rna) m_rna_nt = bp_x[r_rna_pos] # start_pos is counting starting from 0 (python) distance_to_start = start_pos - m_rna_nt + 1 return distance_to_start - farthest_3_prime_r_rna else: break return float('inf')