def rnaplot(seq, struct=None, path='rnaplots', name='temp'):
    """Write a PostScript drawing of a secondary structure and return its path.

    Args:
        seq: Sequence string handed to the ViennaRNA plotting routine.
        struct: Structure string to draw.  When it is None, the first
            element of ``RNA.fold(seq)`` is used instead.
        path: Directory part of the output file name.
        name: File name without the ``.ps`` extension.

    Returns:
        The output file name, ``os.path.join(path, name + '.ps')``.
    """
    import RNA

    # Identity test: "not given" must not depend on how struct implements ==.
    if struct is None:
        struct = RNA.fold(seq)[0]
    filename = os.path.join(path, name + '.ps')
    # RNA.svg_rna_plot(seq,struct,filename)
    colors = [" 1. 0. .2", " 0. .9 .5"]
    # Only the first colour is used; the macro is built from range(0, 10).
    macro = format_cmark_values(range(0, 10), rgb=colors[0])
    RNA.PS_rna_plot_a(seq, struct, filename, '', macro)
    return filename
def local_search(start_seq_, target_structs_, seq_constraint_,
                 context_front=None, context_back=None):
    """Run the configured local search and return ``(seq, cost, steps)``.

    The three inputs are published through the module globals
    ``start_seq``, ``target_structs`` and ``seq_constraint`` before the
    search routine selected by ``SEARCH_STRATEGY`` is called.
    ``context_front`` and ``context_back`` are accepted but not used here.

    Raises:
        ValueError: if ``SEARCH_STRATEGY`` is none of the known strategies.
    """
    global start_seq
    global seq_constraint
    global target_structs

    # Both target structures have to match the start sequence.
    for idx in (0, 1):
        rna.check_struct_seq_match(target_structs_[idx], start_seq_)

    start_seq = start_seq_
    target_structs = target_structs_
    seq_constraint = seq_constraint_

    # TODO: has to be checked -- temporarily forcing RNA.dangles to 1 when
    # it is preset to a non-zero value (and restoring it afterwards).

    uses_sls = (SEARCH_STRATEGY == SearchStrategy.adaptive_walk
                or SEARCH_STRATEGY == SearchStrategy.stochastic_local_search)
    if uses_sls:
        result = local_search_sls_pf()
    elif SEARCH_STRATEGY == SearchStrategy.full_local_search:
        result = local_search_fls_pf()
    else:
        raise ValueError("Specified search strategy not valid.")
    seq, cost, steps = result

    # Drop cached evaluations and release the ViennaRNA arrays.
    eval_seq_container.reset()
    vienna_rna.free_pf_arrays()
    vienna_rna.free_arrays()

    return seq, cost, steps
def main(): """ for sequence string, calculate mfe structure, mfe, pf, base pair probability matrix, plot structures and bppms, calculate accessibilities""" print 'name mfe_low mfe_high pf_low pf_high RRS_acces_low RRS_acces_high AUG_acces_low AUG_acces_high' for seq_file in SeqIO.parse(sys.stdin, 'fasta'): sequ = str(seq_file.seq) fc_low = RNA.fold_compound(sequ, MODEL_LOW_TEMPERATURE) fc_high = RNA.fold_compound(sequ, MODEL_HIGH_TEMPERATURE) struct_low, mfe_low = fc_low.mfe() struct_high, mfe_high = fc_high.mfe() pfstruct_low, pf_low = fc_low.pf() pfstruct_high, pf_high = fc_high.pf() bppm_low = fc_low.bpp() bppm_high = fc_high.bpp() plot_2bppms(bppm_low, bppm_high, seq_file.id) RNA.PS_rna_plot(sequ, struct_low, '{:s}_low_ss.ps'.format(str2filename(seq_file.id))) RNA.PS_rna_plot(sequ, struct_high, '{:s}_high_ss.ps'.format(str2filename(seq_file.id))) constr1, constr2 = seqconstraints(sequ,RRS,START,SPACER) RRS_acces_low = accessibility(sequ,MODEL_LOW_TEMPERATURE,constr1,pf_low) RRS_acces_high = accessibility(sequ,MODEL_HIGH_TEMPERATURE,constr1,pf_high) AUG_acces_low = accessibility(sequ,MODEL_LOW_TEMPERATURE,constr2,pf_low) AUG_acces_high = accessibility(sequ,MODEL_HIGH_TEMPERATURE,constr2,pf_high) print seq_file.id, mfe_low, mfe_high, pf_low, pf_high, RRS_acces_low, RRS_acces_high, AUG_acces_low, AUG_acces_high print versions_used()
def getBPPM(sequence, structure = "", bppm_cutoff = 0.00001):
    """Return the upper-triangular base pair probability matrix as a dict.

    Requires the ViennaRNA Python module (``RNA``).  Uses its ``pf_fold``,
    ``get_pr`` and ``free_pf_arrays`` functions.

    Args:
        sequence: Sequence to fold.
        structure: Constraint string passed to ``RNA.pf_fold``.  A non-empty
            value switches ``RNA.cvar.fold_constrained`` on, an empty one
            switches it off.
        bppm_cutoff: Probabilities that do not exceed this value are stored
            as 0.

    Returns:
        Dict keyed by ``"i_j"`` (1-based, i < j) for every pair of
        positions.  The value is the pair probability when it exceeds
        ``bppm_cutoff`` and 0 otherwise.
    """
    bppm = {}
    #'--noPS', '-d 2', t, P
    RNA.cvar.fold_constrained = 1 if structure != "" else 0
    RNA.pf_fold(sequence, structure)
    try:
        seq_len = len(sequence) + 1
        for i in xrange(1, seq_len):
            # Only the upper triangle is needed, so start j right after i.
            for j in xrange(i + 1, seq_len):
                bpp = RNA.get_pr(i, j)
                bppm[str(i) + "_" + str(j)] = bpp if bpp > bppm_cutoff else 0
    finally:
        # Release the partition function arrays even if a lookup fails.
        RNA.free_pf_arrays()
    return bppm
def mfe_bp_distance(S, G, masked=None):
    """Return the base pair distance between the MFE structure of S and G.

    S is folded with ``RNA.fold`` and the first element of the result is
    compared with the structure G through ``RNA.bp_distance``.

    ``masked`` is part of the signature but is not used by this
    implementation: the fold is always unconstrained.
    """
    mfe_structure = RNA.fold(S)[0]
    distance = RNA.bp_distance(mfe_structure, G)
    return distance
def prep_sec2(seq_five, seq_three, seq_apta, shift, rand):
    """Build a structure string with the folded aptamer wrapped in a stem.

    The two flanks are folded around an artificial marker (100 G, 4 N,
    100 C, framed by single N) and the fold is split on the marker's
    structure to learn how many positions lie on either side of it.  The
    result is: dots for the 5' side, ``rand`` opening brackets, the fold
    of ``seq_apta``, ``rand`` closing brackets, ``shift`` dots and dots
    for the 3' side.
    """
    marker_seq = "N" + "G" * 100 + "N" * 4 + "C" * 100 + "N"
    marker_fold = "." + "(" * 100 + "." * 4 + ")" * 100 + "."

    context_fold, _ = RNA.fold(seq_five + marker_seq + seq_three)
    aptamer_fold, _ = RNA.fold(seq_apta)

    # flanks[0] is the 5' side of the marker, flanks[1] the 3' side.
    flanks = context_fold.split(marker_fold, 1)
    pieces = [
        len(flanks[0]) * ".",
        "(" * rand,
        aptamer_fold,
        ")" * rand,
        "." * shift,
        len(flanks[1]) * ".",
    ]
    return "".join(pieces)
def fold_probability(S, G=None):
    """Run the partition function fold of S and collect pair probabilities.

    G is handed to ``RNA.pf_fold`` as its second argument (default None).

    Returns:
        A triple ``(struct, energy, probabilities)``: the two values
        returned by ``RNA.pf_fold`` followed by a dict that maps every
        0-based position pair ``(left, right)`` with ``left < right`` to
        ``RNA.get_pr(left + 1, right + 1)``.
    """
    struct, energy = RNA.pf_fold(S, G)  # Compute the partition function

    length = len(S)
    dict_probabilities = {}
    for left in range(length):
        for right in range(left + 1, length):
            # get_pr is queried with indices shifted by one.
            dict_probabilities[left, right] = RNA.get_pr(left + 1, right + 1)
    return (struct, energy, dict_probabilities)
def temperature_reactivity(sequence, structure, temperature1, temperature2):
    """Evaluate one structure at two temperatures and compare the energies.

    Returns:
        ``(delta, energy1, energy2)`` where ``energy1`` and ``energy2`` are
        the results of ``eval_structure`` at ``temperature1`` and
        ``temperature2``, and ``delta`` is
        ``abs((energy2 - energy1) / (energy2 + 0.001))``.
    """
    model_a, model_b = RNA.md(), RNA.md()
    model_a.temperature = temperature1
    model_b.temperature = temperature2

    compound_a = RNA.fold_compound(sequence, model_a)
    compound_b = RNA.fold_compound(sequence, model_b)

    energy_a = compound_a.eval_structure(structure)
    energy_b = compound_b.eval_structure(structure)

    # Normalised difference; 0.001 is added so a zero energy does not divide by 0.
    relative_change = abs((energy_b - energy_a) / (energy_b + 0.001))
    return (relative_change, energy_a, energy_b)
def Rewards(self,k):
    """Fold the sequence encoded in ``self.position`` and score it against ``s``.

    The first ``len(str_uindex)`` entries of ``self.position`` form the
    base list.  The remaining entries up to ``self.n`` are flattened, and
    for every index ``i`` of ``a`` the element ``e[c[i]]`` is inserted at
    position ``b[i]``.  The joined sequence is folded with ``RNA.fold``
    and compared position by position with the module-level ``s``.

    Args:
        k: Unused.  The original code had two branches (``k >`` and
            ``k <= len(str_uindex)-1``) with identical bodies, so the
            result never depended on it.

    Returns:
        The fraction of positions at which the folded structure equals
        ``s``.  When that fraction is 1.0 the sequence is also appended to
        the module-level ``solution`` list.
    """
    base_count = len(str_uindex)
    posbasep = self.position[base_count:self.n]
    posbase = self.position[0:base_count]
    e = list(itertools.chain(*posbasep))
    for i in range(len(a)):
        posbase.insert(b[i], e[c[i]])
    mutated_s = ''.join(map(str, posbase))
    mutated_str = RNA.fold(mutated_s)[0]

    n = len(s)
    d = 0.0  # float on purpose: keeps (n-d)/n a true division on Python 2
    for i in range(n):
        if mutated_str[i] != s[i]:
            d = d + 1
    g = (n - d) / n
    if g == 1.0:
        solution.append(mutated_s)
    return g
def test_centroid(self): print "test_centroid\n" fc=RNA.fold_compound(align) fc.pf() (sc,dist) = fc.centroid() print sc,"\tDistance of : %6.2f" %dist ,"\n" self.assertTrue(sc and dist)
def test_eval_structure_pt(self): print "test_eval_structure_pt\n" fc=RNA.fold_compound(seq1) energy= fc.eval_structure_pt(struct1_pt) /100; #/100 for dcal self.assertEqual("%6.2f" % energy, "%6.2f" % -5.60) print struct1, "[%6.2f" % energy,"]\n"
def bt(i,j,k,l,d,data=None):
    """Backtracking callback: return a list of base pairs, or None.

    Only the decomposition step ``RNA.DECOMP_PAIR_HP`` is handled; for it
    the pair ``(i+1, j-1)`` is returned.  The body shows the three ways a
    base pair may be expressed; each assignment overwrites the previous
    one, so the value actually returned is the ``RNA.basepair`` object.
    """
    if d != RNA.DECOMP_PAIR_HP:
        return None

    # 1. A dictionary with 'i' and 'j' keys that specify the coordinates
    #    of the base pair (i,j)
    bp = {'i': i + 1, 'j': j - 1}

    # 2. A tuple (i,j)
    bp = (i + 1, j - 1)

    # 3. An RNA::basepair object
    bp = RNA.basepair()
    bp.i = i + 1
    bp.j = j - 1

    return [bp]
def __init__(self, fullseq, vrna_md):
    """Set up the landscape for ``fullseq`` folded under ``vrna_md``.

    Stores the sequence and model details, creates the fold compound and
    initialises the simulation parameters and counters to their defaults.
    """
    super(TrafoLandscape, self).__init__()

    self._full_sequence = fullseq
    self._model_details = vrna_md
    self._fold_compound = RNA.fold_compound(fullseq, vrna_md)

    # Adjust simulation parameters: the reference value belongs to 37.0
    # degrees (310.15 after adding 273.15) and is rescaled linearly for
    # any other model temperature.
    reference_rt = 0.61632077549999997
    if vrna_md.temperature == 37.0:
        self._RT = reference_rt
    else:
        self._RT = (reference_rt / 310.15) * (273.15 + vrna_md.temperature)

    # Private instance variables:
    self._transcript_length = 0
    self._total_time = 0
    self._nodeid = 0

    # Default parameters:
    self._p_min = 0.01   # probability threshold
    self._fpath = 20     # findpath_search_width
    self._k0 = 2e5       # set directly
    self._dG_max = 0     # set using t_slow
    self._dG_min = 0     # set using t_fast
def main(): for monster in monsters: # calculate 1) foldcompound 2) partition function 3) base pair probability matrix in that order (!) foldmonster = RNA.fold_compound(str(monster)) pfstruct, pf = foldmonster.pf() bppm = foldmonster.bpp() plot_bppm(bppm, monster.id) print versions_used()
def score_match(query, motif='CCTCCT'):
    """Co-fold ``query`` with ``motif`` and return the score.

    Args:
        query: Sequence to test.
        motif: Second strand of the duplex.  Defaults to ``'CCTCCT'``, the
            value that was previously hard-coded.

    Returns:
        The second element of the result of
        ``RNA.cofold(query + '&' + motif)``.
    """
    result = RNA.cofold(query + '&' + motif)
    return result[1]
def test_E_int_loop(self): print "test_E_int_loop" # "123456789012" seq1 = "AGACAAAAGACA" struct1=".(.(....).)." fc=RNA.fold_compound(seq1,None,RNA.OPTION_MFE) e = fc.E_int_loop(2,11) print seq1, " 2,7 = [ %6.2f" %e ,"] \n" self.assertEqual("%6.2f" %e,"%6.2f" % +80)
def test_pf(self): print "test_pf" fc= RNA.fold_compound(seq1) (ss,gfe) = fc.pf() print ss, "[ %6.2f" %gfe ,"]\n" self.assertTrue(ss) bp_dis = fc.mean_bp_distance() print seq1 ,"\t meanBPDistance : ", bp_dis,"\n" self.assertTrue(bp_dis)
def accessibility ( sequence, md, constr, pf_noconstr ):
    """Return the accessibility of the region marked with 'x' in ``constr``.

    Args:
        sequence: Sequence to fold.
        md: Model details; ``md.temperature`` has 273.15 added to it in the
            exponent.
        constr: Constraint string added to the fold compound with
            ``RNA.CONSTRAINT_DB_DEFAULT``.
        pf_noconstr: Ensemble energy of the unconstrained fold.

    Returns:
        ``exp((pf_noconstr - pf_constr) / (BOLTZMANN_K * T))`` where
        ``pf_constr`` is the ensemble energy under the constraint, or 0
        when ``constr`` contains no 'x'.
    """
    # Without an 'x' the answer is the constant 0, so skip the
    # partition function computation altogether.
    if 'x' not in constr:
        return 0

    fc_constr = RNA.fold_compound(sequence, md)
    fc_constr.constraints_add(constr, RNA.CONSTRAINT_DB_DEFAULT)
    _, pf_constr = fc_constr.pf()
    return exp((pf_noconstr - pf_constr)/(BOLTZMANN_K * (md.temperature + 273.15)))
def prep_sec1(seq_five, seq_three, seq_apta, shift):
    """Build a structure string with an unpaired aptamer closed by a stem.

    The two flanks are folded around an artificial marker (100 G, 4 N,
    100 C, framed by single N) and the fold is split on the marker's
    structure to learn how many positions lie on either side of it.  The
    result is: dots for the 5' side, ``shift`` opening brackets, one dot
    per aptamer position, ``shift`` closing brackets and dots for the 3'
    side.
    """
    marker_seq = "N" + "G" * 100 + "N" * 4 + "C" * 100 + "N"
    marker_fold = "." + "(" * 100 + "." * 4 + ")" * 100 + "."

    context_fold, _ = RNA.fold(seq_five + marker_seq + seq_three)

    # flanks[0] is the 5' side of the marker, flanks[1] the 3' side.
    flanks = context_fold.split(marker_fold, 1)
    pieces = [
        len(flanks[0]) * ".",
        "(" * shift,
        "." * (len(seq_apta)),
        ")" * shift,
        len(flanks[1]) * ".",
    ]
    return "".join(pieces)
def rnafold(seq, name=None):
    """Fold ``seq`` with ``RNA.fold``; best effort.

    Any exception raised by the fold is printed and swallowed, in which
    case None is returned.  ``name`` is accepted but not used.
    """
    import RNA

    try:
        return RNA.fold(seq)
    except Exception as err:
        print (err)
        return None
def prep_sec2_comp(seq_five, seq_three, seq_apta, comp, rand):
    """Build a structure string from the lengths of the three parts.

    Layout, left to right: ``len(seq_five) - comp`` dots, ``rand + comp``
    opening brackets, ``len(seq_apta) - comp`` dots, ``rand + comp``
    closing brackets and ``len(seq_three)`` dots.

    Only the lengths of the sequences are used.  The previous version also
    folded the flanks around a 200-nt marker and then discarded the
    result; that dead call has been removed.
    """
    stem = rand + comp
    seq = (
        (len(seq_five) - comp) * "."
        + "(" * stem
        + "." * (len(seq_apta) - comp)
        + ")" * stem
        + len(seq_three) * "."
    )
    return seq
def test_eval_hp_loop(self): print "test_eval_hp_loop" seq1 = "GCAAAAGG" struct1= ".(....)." fc=RNA.fold_compound(seq1) #ehair = fc.eval_hp_loop(2,7) ehair = fc.E_hp_loop(2,7) print seq1, " 2,7 = [ %6.2f" %ehair ,"] \n" self.assertEqual("%6.2f" %ehair,"%6.2f" % +410)
def prep_sec2_left(seq_five, seq_three, seq_apta, shift, rand):
    """Build a structure string from the lengths of the three parts.

    Layout, left to right: ``len(seq_five) - shift`` dots, ``rand + shift``
    opening brackets, ``len(seq_apta) - shift`` dots, ``rand`` closing
    brackets followed by ``shift`` more, and ``len(seq_three)`` dots.

    Only the lengths of the sequences are used.  The previous version also
    folded the flanks around a 200-nt marker and then discarded the
    result; that dead call has been removed.
    """
    seq = (
        (len(seq_five) - shift) * "."
        + "(" * (rand + shift)
        + "." * (len(seq_apta) - shift)
        + ")" * (rand)
        + ")" * shift
        + len(seq_three) * "."
    )
    return seq
def test_file_SHAPE_read(self): print "test_file_SHAPE_read" reactivities = getShapeDataFromFile("data/TPP_riboswitch_E.coli.shape_2rows") (a,b,c) = RNA.file_SHAPE_read("data/TPP_riboswitch_E.coli.shape_2rows", 79, -1) print "read file:" print a print b print c print reactivities print a
def test_eval_covar_structure(self): print "test_eval_covar_structure\n" s1="CCCCAAAACGGG" s2="CCCGAAAAGGGG" s3="CCCCAAAAGGGG" ali = [s1,s2,s3] covarStructure = "((((....))))" fc = RNA.fold_compound(ali) pseudoEScore=fc.eval_covar_structure2(covarStructure) print covarStructure, "[ %6.2f" %pseudoEScore ,"]\n" self.assertTrue(pseudoEScore)
def add(self, seq):
    """Evaluate ``seq`` once and cache the results in ``self.container``.

    The entry is keyed by ``str(seq)``; if the key already exists the call
    returns immediately.  Otherwise the entry receives:

    * ``"pos_bp_pr"``: one pair of values per position, see
      ``pos_bp_pr_iter`` below,
    * ``"energy_ensemble"``: result of ``vienna_rna.pf_fold_par``,
    * ``"energy_mfe"`` / ``"struct_mfe"``: result of and buffer handed to
      ``vienna_rna.fold_par``,
    * ``"seq_pr"``: ``exp((energy_ensemble - energy_struct) / kT)`` for
      each of the two target structures.

    Reads the module-level names ``target_structs``, ``vienna_rna`` and
    ``kT``.
    """
    # The two helpers are closures: they read ``bppm``, ``iindx`` and
    # ``size``, which are assigned further down, before the generator is
    # consumed by list(...).
    def bp_pr(i, j):
        # Look up the pair probability of the 0-based positions i and j.
        # The indices are ordered so that i <= j and shifted by one
        # before they are combined with the iindx table.
        if i > j:
            i, j = j, i
        return bppm[iindx[i + 1] - (j + 1)]

    def pos_bp_pr_iter():
        """Yield probability for base i being paired in each structure."""
        struct_0, struct_1 = target_structs
        for i in xrange(len(seq)):
            # Partner of position i in each target; None means unpaired.
            a = struct_0.basepairs[i]
            b = struct_1.basepairs[i]
            if a is not None:
                if b is not None:
                    yield bp_pr(i, a), bp_pr(i, b)
                else:
                    # Unpaired in the second target: use 1 minus the sum
                    # of all pair probabilities of position i.
                    sum_ = sum((bp_pr(i, j) for j in xrange(size)))
                    yield bp_pr(i, a), 1 - sum_
            elif b is not None:
                sum_ = sum((bp_pr(i, j) for j in xrange(size)))
                yield 1 - sum_, bp_pr(i, b)
            else:
                # Unpaired in both targets: same value for both.
                sum_ = 1 - sum((bp_pr(i, j) for j in xrange(size)))
                yield sum_, sum_

    seq_str = str(seq)
    if seq_str in self.container:
        return
    size = len(seq)
    self.container[seq_str] = {}
    # with LOCK_VIENNA_RNA:
    # NOTE(review): the calls below presumably share library-global state
    # (export_bppm takes no argument), so their order must be kept --
    # confirm against the vienna_rna wrapper.
    energy_ensemble = vienna_rna.pf_fold_par(seq_str, None, None, 1, 0, 0)
    bppm = vienna_rna.doubleArray_frompointer(vienna_rna.export_bppm())
    i = vienna_rna.get_iindx(len(seq_str))
    iindx = vienna_rna.intArray_frompointer(i)
    energy_structs = (
        vienna_rna.energy_of_struct(seq_str, str(target_structs[0])),
        vienna_rna.energy_of_struct(seq_str, str(target_structs[1])))
    # NOTE(review): struct_mfe is handed to fold_par as a blank buffer of
    # the sequence length; presumably the wrapper fills it in place --
    # confirm, otherwise "struct_mfe" stores only blanks.
    struct_mfe = " " * len(seq_str)
    energy_mfe = vienna_rna.fold_par(seq_str, struct_mfe, None, 0, 0)
    self.container[seq_str]["pos_bp_pr"] = list(pos_bp_pr_iter())
    self.container[seq_str]["energy_ensemble"] = energy_ensemble
    self.container[seq_str]["energy_mfe"] = energy_mfe
    self.container[seq_str]["struct_mfe"] = struct_mfe
    self.container[seq_str]["seq_pr"] = (
        math.exp((energy_ensemble - energy_structs[0]) / kT),
        math.exp((energy_ensemble - energy_structs[1]) / kT))
def full_hairpin(seq_five, seq_three, aptamer, shift):
    """Embed ``aptamer`` between the flanks, padded by random linkers.

    ``shift`` random letters from ``a/c/g/t`` are placed on each side of
    the aptamer (left linker drawn first, then the right one).

    Returns:
        ``(final_seq, active_seq)``.  ``final_seq`` is
        ``seq_five + linker + aptamer + linker + seq_three``.
        ``active_seq`` is the same string with the aptamer replaced by its
        fold from ``RNA.fold``, in which every ``.`` is turned into ``N``.
    """
    DNAbet = ["a", "c", "g", "t"]
    take, _ = RNA.fold(aptamer)

    left_linker = "".join(random.choice(DNAbet) for _ in range(shift))
    right_linker = "".join(random.choice(DNAbet) for _ in range(shift))

    prefix = seq_five + left_linker
    suffix = right_linker + seq_three
    final_seq = prefix + aptamer + suffix
    # Assemble from the known parts.  Splitting final_seq on the aptamer
    # picked the wrong pieces whenever the aptamer text also occurred in a
    # flank or linker, and failed outright for an empty aptamer.
    active_seq = prefix + take.replace(".", "N") + suffix
    return final_seq, active_seq
def tenfold(fasta):
    """Fold tabbed FASTA input and print a binary pairing string per line.

    Each input line has the form ``label \\t sequence`` (label e.g.
    ``gene;chr:start-stop``).  The sequence is folded with ``RNA.fold``
    and the structure is printed after the label with

        1 = double-stranded (``(`` or ``)``)
        0 = single-stranded (``.``)

    Args:
        fasta: Path of the tab-separated input file.
    """
    # The context manager closes the file; it used to be left open.
    with open(fasta) as handle:
        for line in handle:
            label, sequence = line.strip().split("\t")
            dotplot, fe = RNA.fold(sequence)
            re1 = re.sub(r'\(|\)', r'1', dotplot)
            re10 = re.sub(r'\.', r'0', re1)
            print(label + "\t" + re10)
def main(argv):
    """Run the Nussinov algorithm on a FASTA file and print the structures.

    ``argv`` is parsed by ``CmdParser`` into a dict of which the keys
    ``"all"``, ``"FastaFile"`` and ``"graphic"`` are read here.  The
    sequence and matrix come from ``InputParser``, the matrix is filled by
    ``Nussinov`` and traced back with ``cTraceBack`` (``"all"`` set) or
    ``TraceBack``.  Every structure returned by ``PosSecStruc`` is printed
    in dot-bracket notation.
    """
    ArgumentDic = CmdParser(argv)
    if ArgumentDic["all"]:
        # necessary to allow the Traceback to finish without reaching the recursionlimit
        # only needed for calculating all possible structures
        resource.setrlimit(resource.RLIMIT_STACK, (2**29,-1))
        sys.setrecursionlimit(10**6)
    s, m = InputParser(ArgumentDic["FastaFile"])
    m = Nussinov(s, m)
    # this catches sequences with no secondary structure
    # (top-right cell of the matrix is 0)
    if m[0][len(s) - 1] == 0:
        print
        print " Your Sequence has no computable secondary structure."
        print
    else:
        if ArgumentDic["all"]:
            p, n = cTraceBack(s, m)
        else:
            p, n = TraceBack(s, m)
        print
        print " This sequence has a maximum of " + str(n) + " base pairs."
        print
        print s
        R = PosSecStruc(p, n)
        for res in R:
            # Start from an all-unpaired string of the sequence length ...
            basepairseq = ""
            for _ in range(len(s)):
                basepairseq +="."
            # ... and write one bracket pair per (open, close) index pair.
            for pair in res:
                bps = basepairseq[:pair[0]] + "(" + basepairseq[pair[0] + 1: pair[1]] + ")" + basepairseq[pair[1] + 1:]
                basepairseq = bps
            print basepairseq
        print
        # NOTE(review): placed after the loop, so only the last printed
        # structure is drawn -- confirm the intended nesting.
        if ArgumentDic["graphic"] == True:
            RNA.gmlRNA(s, basepairseq, "Nussinov.gml", "A")
            g = igraph.read("Nussinov.gml")
            layout = g.layout("kk")
            igraph.plot(g, layout = layout)
def test_eval_structure_verbose(self): print "test_eval_structure_verbose" fc = RNA.fold_compound(seq1) filename= "test-RNA-mfe_eval.py.out" try: f = open(filename, "w") print filename ," is opened for writing\n" energy = fc.eval_structure_verbose(struct1,f) energy2 = fc.eval_structure_verbose(struct1,None) self.assertEqual("%6.2f" % energy, "%6.2f" % -5.60) print struct1, "[%6.2f" % energy,"]\n" except IOError: print "Could not open ",filename
#FoldingFreeEnergy
'''Given an mRNA sequence, RNAfold can calculate the folding free energy
of the mRNA secondary structure.

Please download the library to your system from
'https://www.tbi.univie.ac.at/RNA/'.
'''
import RNA

mrna = 'ATGC'  # input sequence.
fold_result = RNA.fold(str(mrna))
free_energy = fold_result[1]
# Keep two decimal places.
print(float('%6.2f' % free_energy))
def test_mfe_window(self): print "test_mfe_window" fc = RNA.fold_compound(seq1, None, RNA.OPTION_MFE | RNA.OPTION_WINDOW) (mfe) = fc.mfe_window() print "[ %6.2f ]" % mfe self.assertEqual("%6.2f" % mfe, "%6.2f" % -5.60)
# Driver: run merge_check on one sequence with two structures s1 and s2
# and print the resulting sections.
#
# Three input sets follow.  Each assignment overwrites the previous one,
# so only the LAST set (80 nt) is actually used; the first two are kept
# as alternative inputs.
sequence = "GAAGUGUGGCUGGCAAGGAGAAUUAUGUGUGAAAAUUUGUCGGUAGAUAGGCAGUGGUGGCGAAGGGAGGGGGAAACGAUUUUGCCUCCGACGUCCAUCAUCGCCAGACAGGACGGUCUCCCUUCCUACAGGUCUCUGGCACAUAUCCUC"
s1 = "....(((.....))).(((....(((((((.(.....((((....))))(((.((((((((....(((((.(((......))).)))))...).))))))).)))((((((((.((....)).))))....)))).).)))))))))).."
s2 = "((.(((((((.((.(((((((....(.(((.......((((....))))(((.(((((((((...(((((.(((......))).)))))..)).))))))).)))..))).)....)))).))))).....)))....))))...))..."

sequence = "GGAUAUUUCUUGUUGGCGCUCGGGCCGUCACUCUCCUCCCAACGAAACCCCAGGAGAGACAUCACAUAAGCAAACCUUUUGAUUUGAUGUAACCGUGGAGAAAACAAGUUCCUGUUACUUGGACACGUCUUUAGAAAAAACAGGAACGGU"
s1 = "......((((.((((((......)))....(((((((..............)))))))((((((((.............))...)))))))))...))))...((..(((((((((....(((....)))........))))))))).))"
s2 = "........(((((((((......)))....(((((((..............)))))))((((((.((((.........)).)).)))))).....((((((...(((((.......)))))......)))))).....))))))......"

sequence = "AACGGGUGGGUACUCCCUGGUAAAGCCCGAGUCGAGACAUUGUCAUAUGUAUGAGAUUCCUUUGUUGUUGGUCGGCUGGG"
s1 = "..((((((((....))).......)))))((((((.....(.((((....)))).)...((........))))))))..."
s2 = "...((((...((((....))))..)))).(((((((((....((..........)).......))).....))))))..."

# Debug=True makes merge_check install DEBUG-level coloured logging.
Debug = True
# Debug = False

sections = merge_check(sequence, s1, s2, Debug=Debug)

print("input")
print(sequence)
print(s1)
print(s2)

# fc is created but not used below.
fc = RNA.fold_compound(sequence)
# Pair tables of both structures, printed next to the dot-bracket strings.
pt1 = RNA.ptable_from_string(s1)
pt2 = RNA.ptable_from_string(s2)
helper.print_tables(s1, s2, pt1, pt2)

# res = merge_recursive.recursive_merge(sequence, s1, s2, sections=sections, search_width=500, Debug=False, Verbose=True, new=True, plot_graph=False)
print(sections)
def test_subopt3(self): print "test_subopt_cb (as fold_compound method)\n" a = RNA.fold_compound(sequence) a.subopt_cb(500, print_subopt_result)
def merge_check(sequence, s1, s2, Debug=False):
    """Split two structures of the same length into nested sections.

    Both dot-bracket strings are converted to pair tables and loop index
    tables.  ``recursive_walk`` then repeatedly asks ``next_int_loops`` for
    enclosed regions and descends into each of them.

    Args:
        sequence: Not used in this function.
        s1, s2: Dot-bracket structure strings.
        Debug: When true, coloured logging is installed at DEBUG level.

    Returns:
        A nested list of section boundaries.  The outermost list starts
        with 0 and ends with ``len(s1)``; every inner list has the form
        ``[start, <nested sections...>, end]``.

    The helpers ``ignore_unpaired_nt``, ``ignore_non_aligned``,
    ``bp_dist_section`` and ``new_exterior_loops`` are defined here but are
    not called by ``recursive_walk``.

    NOTE(review): the nesting of some statements was reconstructed from a
    source without reliable indentation -- confirm against the upstream
    file.
    """
    if Debug:
        coloredlogs.DEFAULT_LOG_FORMAT = '%(levelname)s %(message)s'
        coloredlogs.install(level='DEBUG')

    # logging.info("It works!")
    # logging.debug("This is a log message.")
    # print_d("This is a log message.", "test")
    # logging.error("this is an error message")

    # Pair tables are indexed from 1 (they are sliced with [1:] and
    # indexed with position + 1 below).
    ptables_s1 = RNA.ptable_from_string(s1)
    ptables_s2 = RNA.ptable_from_string(s2)
    ltables_s1 = RNA.loopidx_from_ptable(ptables_s1)
    ltables_s2 = RNA.loopidx_from_ptable(ptables_s2)

    def next_int_loops(min_pos, max_pos):
        ''' new function to find best interior loops with largest bp_dist

        Scans the window [min_pos, max_pos] for positions that pair with
        the same partner in both structures, scores each candidate region
        and greedily keeps non-overlapping ones.  Returns a list of
        [start, end] index pairs.
        '''
        print_d("start next int loops", min_pos, max_pos)

        curr_lvl = 0  # only printed
        diff = 0  # only printed
        candidates = defaultdict(lambda: [float('inf'), float('inf'), 0, 0, 0])  # unused
        c2 = defaultdict(list)  # unused
        c3 = []  # accepted candidates

        # i is the 0-based offset; p1/p2 are the pair table entries of the
        # 1-based position i + 1.
        for i, (p1, p2) in enumerate(zip(ptables_s1[1:], ptables_s2[1:])):
            # if i==0: continue
            if i < min_pos or i > max_pos:
                continue
            if p1 == 0 and p2 == 0:
                continue
            # if p1==p2 and p1 > i:
            #     curr_lvl += 1
            # elif p1==p2 and p1 < i:
            #     curr_lvl -= 1

            # check which compatible sections have the highest potential for recursion
            if p1 == p2 and i < p1:
                j = p1
                # check compatibility
                last_i = i - 1
                next_j = j + 1
                curr_i = i + 1
                next_i = i + 2
                last_j = j - 1

                # out of bounds
                if next_j > len(s1) or last_i < 0:
                    continue

                # inner/outer section not compatible
                # if ptables_s1[last_i] != ptables_s2[last_i] or ptables_s1[next_j] != ptables_s2[next_j]:
                #     print_d(last_i,i,j,next_j)
                #     print_d("fail1",ptables_s1[last_i], ptables_s2[last_i] )
                #     print_d("fail2",ptables_s1[next_j], ptables_s2[next_j] )
                #     continue

                print_d(curr_i, next_i, last_j, j)
                # print_d("fail1",ptables_s1[curr_i], ptables_s2[curr_i] )
                # print_d("fail2",ptables_s1[i], ptables_s2[i] )

                # Table entries i + 1 and i must agree between structures.
                if ptables_s1[curr_i] != ptables_s2[curr_i] or ptables_s1[
                        i] != ptables_s2[i]:
                    print_d(curr_i, next_i, last_j, j)
                    print_d("fail1", ptables_s1[curr_i], ptables_s2[curr_i])
                    print_d("fail2", ptables_s1[last_j], ptables_s2[last_j])
                    continue

                # debatable - extra ( & ) at end/start pos
                # if ptables_s1[last_i] == 0 or ptables_s1[next_j] == 0:
                #     continue

                # if start and end are unpaired, both i and j have to be unpaired,
                # otherwise energies don't add up properly / various errors
                last_i = i
                if ptables_s1[last_i] == 0 and ptables_s2[last_i] != 0:
                    continue
                if ptables_s1[next_j] == 0 and ptables_s2[next_j] != 0:
                    continue

                # if ptables_s1[i+1] == 0 or ptables_s1[j] == 0:
                #     continue#
                # NOTE(review): this check reads as live code in the
                # collapsed source, but it directly follows a commented-out
                # block -- confirm it is not part of that comment.
                if ptables_s1[i] == 0 or ptables_s1[j + 1] == 0:
                    continue

                # if ptables_s1[i] < ptables_s1[j+1]: # last i,j: )( instead of ()
                if ptables_s1[ptables_s1[i]] != ptables_s1[
                        j + 1]:  # last i,j: )( instead of ()
                    continue

                # Outer part: the window with the region replaced by dots.
                outer_s1 = s1[min_pos:i] + "." * (p1 - i) + s1[p1:max_pos + 1]
                outer_s2 = s2[min_pos:i] + "." * (p2 - i) + s2[p2:max_pos + 1]
                print_d(outer_s1, s1[i - 1], ptables_s1[i], s1[p1],
                        ptables_s1[j + 1], "//", ptables_s1[ptables_s1[i]])
                print_d(outer_s2)

                # Inner part: the region itself.
                inner_s1 = s1[i:p1]
                inner_s2 = s2[i:p1]
                print_d(inner_s1)
                print_d(inner_s2)

                inner_size = p1 - i
                outer_size = max_pos - min_pos - inner_size

                # print (outer_s1)
                # print (outer_s2)
                # print (inner_s1)
                # print (inner_s2)

                bp_dist_inner = RNA.bp_distance(inner_s1, inner_s2)
                bp_dist_outer = RNA.bp_distance(outer_s1, outer_s2)

                # bp_dist = max(bp_dist_outer, bp_dist_inner) - min(bp_dist_outer, bp_dist_inner)
                # optimize = (inner_size/outer_size)*bp_dist_inner
                optimize = (inner_size / outer_size)
                # Score: distance of the inner/outer size ratio from 0.6
                # (smaller is better; the line above is overwritten).
                optimize = abs(0.6 - (inner_size / outer_size))

                # if the step is too small
                # if bp_dist_outer < 3:
                #     continue
                # if min(bp_dist_outer, bp_dist_inner) < 10: # 300_min10
                #     continue
                if min(bp_dist_outer, bp_dist_inner) < 3:  # 300_r
                    continue
                # this is the standard.. ?
                # if min(bp_dist_outer, bp_dist_inner) < 1:
                #     continue

                # overwrite with better candidate
                print_d("candidate", i, p1, p2, curr_lvl, diff, "inner size",
                        inner_size, "outer size", outer_size, "opt:", optimize)
                c3.append((i, j, optimize, inner_size, outer_size,
                           bp_dist_inner, bp_dist_outer))

        """ recursion conditions: inner section > 20 bp outer / inner section in the region of 0.25 to 0.75 maximize inner section size which has at least a 3 bp distance to the outer section """

        # print ("---")
        print_d("found candidates:")
        # Ascending by score, so the best candidates claim positions first.
        c3 = sorted(c3, key=lambda item: item[2], reverse=False)  # highest opt.

        available = [0 for i in ptables_s1]
        for key in c3:
            # i and j still hold the values of the previous iteration here.
            print_d("c3", i, j)
            i = key[0]
            j = key[1]
            # all nucleotides between i and j need to be available
            if all(i == 0 for i in available[i + 1:j]):
                print_d('add', i, j)
                available[i:j] = [1] * (j - i)

        # Turn runs of 1s into [start, end] pairs; a run is closed only
        # when a 0 follows it.
        mode = 0
        indices = []
        for i in range(len(available)):
            if available[i] == 1:
                if mode == 0:
                    start_pos = i
                    mode = 1
            else:
                if mode == 1:
                    indices.append([start_pos, i])
                    mode = 0

        print_d("final indices", indices)
        return indices

    def ignore_unpaired_nt(i, j):
        # Return the first index in [i, j) that is paired in both
        # structures, or j when there is none.
        while i < j:
            # if ptables_s1[i] != 0 or ptables_s2[i] != 0:
            if ptables_s1[i] != 0 and ptables_s2[i] != 0:
                # i -= 1
                return i
            i += 1
        # case if no paired nt
        return j

    def ignore_non_aligned(i, j):
        # Return the first index in [i, j) with the same non-zero entry in
        # both tables that is smaller than the index itself, or j when
        # there is none.
        while i < j:
            if ptables_s1[i] == ptables_s2[
                    i] and ptables_s1[i] != 0 and i > ptables_s1[i]:
                # i -= 1
                return i
            i += 1
        # case if no paired nt
        return j

    def bp_dist_section(i, j):
        # bp dist s1 to s2 for a given section
        return RNA.bp_distance(s1[i - 1:j], s2[i - 1:j])

    def new_exterior_loops(i, j):
        # Collect (start, end) ranges from both structures inside [i, j)
        # and merge ranges that overlap or disagree at their borders.
        if i == 0:
            i = 1

        def exterior_loops_per_s(ptable, ltable, i, j):
            # Walk one structure from i to j and list (start, end) index
            # pairs; end is the next paired position with the same loop
            # index as start.
            i_s1 = i
            j_s1 = j
            s1_list = []
            while True:
                # print ("s", i_s1)
                # ignore unpair nt
                while i_s1 < j:
                    if ptable[i_s1] != 0:
                        break
                    i_s1 += 1
                if i_s1 >= j:
                    break
                # j_s1 starts at j, so this loop body never runs.
                while j_s1 < j:
                    if ptable[j_s1] != 0:
                        break
                    j_s1 -= 1
                i_s1_start = i_s1
                end_loop = ltable[i_s1]
                i_s1 += 1
                while i_s1 < j:
                    if ltable[i_s1] == end_loop and ptable[i_s1] != 0:
                        break
                    i_s1 += 1
                # print (i_s1_start,j_s1, i_s1)
                s1_list.append((i_s1_start, i_s1))
                # break
                i_s1 += 1
            return s1_list

        s1_list = exterior_loops_per_s(ptables_s1, ltables_s1, i, j)
        s2_list = exterior_loops_per_s(ptables_s2, ltables_s2, i, j)

        # print (s1_list)
        # print (s2_list)

        # From here on i and j index into s1_list and s2_list.
        i = 0
        j = 0
        p_min = None
        exterior_loops = []

        # go over the list of exterior loops of s1 and s2 and see if
        # there are overlapping sections, which have to be merged here
        while (len(s1_list) != i and len(s2_list) != j):
            # print ("start", i, j, s1_list[i])
            if p_min == None:
                p_min = min(s1_list[i][0], s2_list[j][0])
            t1 = s1_list[i]
            t2 = s2_list[j]
            min_t = min(min(t1[0], t2[0]), p_min)
            max_t = max(t1[1], t2[1])
            print_d('current min/max:', min_t, max_t)

            # Advance one list while its next range starts inside the
            # current merged range.
            if i + 1 < len(s1_list):
                next_i_min = s1_list[i + 1][0]
                print_d('s1 list:', next_i_min, max_t)
                if next_i_min < max_t:
                    i += 1
                    continue
            if j + 1 < len(s2_list):
                next_j_min = s2_list[j + 1][0]
                print_d('s2 list:', next_j_min, max_t)
                if next_j_min < max_t:
                    j += 1
                    continue

            exterior_loops.append((min_t, max_t))
            i += 1
            j += 1
            p_min = None
            # print ("end", i, j)

        # check here for )( / ). / .(
        # )( / .( / ).
        loop_id = 0
        while len(exterior_loops) > 1 and loop_id + 1 < len(exterior_loops):
            loop = exterior_loops[loop_id][1]
            next_loop = exterior_loops[loop_id + 1][0]
            print_d("check", loop, next_loop)
            print_d(s1[loop - 1], s1[next_loop - 1])
            print_d(s2[loop - 1], s2[next_loop - 1])
            # check for compatibility if end and start for the next loop is next to each other
            # if loop+1 == next_loop and (s1[loop-1] != s2[loop-1] or s1[next_loop-1] != s2[next_loop-1]):
            # if loop+1 == next_loop and (ptables_s1[loop] != ptables_s2[loop] or ptables_s1[next_loop] != ptables_s2[next_loop]):
            if loop + 1 <= next_loop and (
                    ptables_s1[loop] != ptables_s2[loop]
                    or ptables_s1[next_loop] != ptables_s2[next_loop]):
                print_d('merge', loop_id, loop_id + 1)
                exterior_loops[loop_id] = (min(exterior_loops[loop_id][0], exterior_loops[loop_id+1][0]),\
                                           max(exterior_loops[loop_id][1], exterior_loops[loop_id+1][1]) )
                exterior_loops.pop(loop_id + 1)
                loop_id = 0
            # overlapping
            if loop >= next_loop:
                print_d('merge', loop_id, loop_id + 1)
                exterior_loops[loop_id] = (min(exterior_loops[loop_id][0], exterior_loops[loop_id+1][0]),\
                                           max(exterior_loops[loop_id][1], exterior_loops[loop_id+1][1]) )
                exterior_loops.pop(loop_id + 1)
                loop_id = 0
            # NOTE(review): after a merge loop_id is reset to 0 and then
            # incremented here, so the scan resumes at 1 -- confirm.
            loop_id += 1

        return exterior_loops

    def recursive_walk(i, j, r_depth=0):
        # Build the nested section list for the window [i, j].
        # collect paths at each recursion level
        sections = []

        # if r_depth==0:
        #     e = new_exterior_loops(i,j)
        # else:
        #     # e = interior_loops(i,j)
        #     e = next_int_loops(i-1,j-1)
        # print_d ("all ext/int loops:", e)

        e = next_int_loops(i, j)

        for loop in e:
            ext_i, ext_j = loop
            to_add = [ext_i]
            # if r_depth==0:
            #     return_value = recursive_walk(ext_i, ext_j, r_depth=r_depth+1)
            # else:
            return_value = recursive_walk(ext_i, ext_j - 1, r_depth=r_depth + 1)
            to_add += return_value
            to_add += [ext_j]
            sections.append(to_add)

        if r_depth == 0:
            sections.insert(0, 0)  # start with 0
            sections.append(j)  #j-1 # end with total length
        return sections

    return recursive_walk(0, len(s1))
def next_int_loops(min_pos, max_pos):
    ''' new function to find best interior loops with largest bp_dist

    Scans the window [min_pos, max_pos] for positions that pair with the
    same partner in both structures, scores each candidate region and
    greedily keeps non-overlapping ones.

    Reads ``ptables_s1``, ``ptables_s2``, ``s1`` and ``s2`` from the
    enclosing scope.

    Returns a list of [start, end] index pairs.

    NOTE(review): the nesting of some statements was reconstructed from a
    source without reliable indentation -- confirm against the upstream
    file.
    '''
    print_d("start next int loops", min_pos, max_pos)

    curr_lvl = 0  # only printed
    diff = 0  # only printed
    candidates = defaultdict(lambda: [float('inf'), float('inf'), 0, 0, 0])  # unused
    c2 = defaultdict(list)  # unused
    c3 = []  # accepted candidates

    # i is the 0-based offset; p1/p2 are the pair table entries of the
    # 1-based position i + 1.
    for i, (p1, p2) in enumerate(zip(ptables_s1[1:], ptables_s2[1:])):
        # if i==0: continue
        if i < min_pos or i > max_pos:
            continue
        if p1 == 0 and p2 == 0:
            continue
        # if p1==p2 and p1 > i:
        #     curr_lvl += 1
        # elif p1==p2 and p1 < i:
        #     curr_lvl -= 1

        # check which compatible sections have the highest potential for recursion
        if p1 == p2 and i < p1:
            j = p1
            # check compatibility
            last_i = i - 1
            next_j = j + 1
            curr_i = i + 1
            next_i = i + 2
            last_j = j - 1

            # out of bounds
            if next_j > len(s1) or last_i < 0:
                continue

            # inner/outer section not compatible
            # if ptables_s1[last_i] != ptables_s2[last_i] or ptables_s1[next_j] != ptables_s2[next_j]:
            #     print_d(last_i,i,j,next_j)
            #     print_d("fail1",ptables_s1[last_i], ptables_s2[last_i] )
            #     print_d("fail2",ptables_s1[next_j], ptables_s2[next_j] )
            #     continue

            print_d(curr_i, next_i, last_j, j)
            # print_d("fail1",ptables_s1[curr_i], ptables_s2[curr_i] )
            # print_d("fail2",ptables_s1[i], ptables_s2[i] )

            # Table entries i + 1 and i must agree between structures.
            if ptables_s1[curr_i] != ptables_s2[curr_i] or ptables_s1[
                    i] != ptables_s2[i]:
                print_d(curr_i, next_i, last_j, j)
                print_d("fail1", ptables_s1[curr_i], ptables_s2[curr_i])
                print_d("fail2", ptables_s1[last_j], ptables_s2[last_j])
                continue

            # debatable - extra ( & ) at end/start pos
            # if ptables_s1[last_i] == 0 or ptables_s1[next_j] == 0:
            #     continue

            # if start and end are unpaired, both i and j have to be unpaired,
            # otherwise energies don't add up properly / various errors
            last_i = i
            if ptables_s1[last_i] == 0 and ptables_s2[last_i] != 0:
                continue
            if ptables_s1[next_j] == 0 and ptables_s2[next_j] != 0:
                continue

            # if ptables_s1[i+1] == 0 or ptables_s1[j] == 0:
            #     continue#
            # NOTE(review): this check reads as live code in the collapsed
            # source, but it directly follows a commented-out block --
            # confirm it is not part of that comment.
            if ptables_s1[i] == 0 or ptables_s1[j + 1] == 0:
                continue

            # if ptables_s1[i] < ptables_s1[j+1]: # last i,j: )( instead of ()
            if ptables_s1[ptables_s1[i]] != ptables_s1[
                    j + 1]:  # last i,j: )( instead of ()
                continue

            # Outer part: the window with the region replaced by dots.
            outer_s1 = s1[min_pos:i] + "." * (p1 - i) + s1[p1:max_pos + 1]
            outer_s2 = s2[min_pos:i] + "." * (p2 - i) + s2[p2:max_pos + 1]
            print_d(outer_s1, s1[i - 1], ptables_s1[i], s1[p1],
                    ptables_s1[j + 1], "//", ptables_s1[ptables_s1[i]])
            print_d(outer_s2)

            # Inner part: the region itself.
            inner_s1 = s1[i:p1]
            inner_s2 = s2[i:p1]
            print_d(inner_s1)
            print_d(inner_s2)

            inner_size = p1 - i
            outer_size = max_pos - min_pos - inner_size

            # print (outer_s1)
            # print (outer_s2)
            # print (inner_s1)
            # print (inner_s2)

            bp_dist_inner = RNA.bp_distance(inner_s1, inner_s2)
            bp_dist_outer = RNA.bp_distance(outer_s1, outer_s2)

            # bp_dist = max(bp_dist_outer, bp_dist_inner) - min(bp_dist_outer, bp_dist_inner)
            # optimize = (inner_size/outer_size)*bp_dist_inner
            optimize = (inner_size / outer_size)
            # Score: distance of the inner/outer size ratio from 0.6
            # (smaller is better; the line above is overwritten).
            optimize = abs(0.6 - (inner_size / outer_size))

            # if the step is too small
            # if bp_dist_outer < 3:
            #     continue
            # if min(bp_dist_outer, bp_dist_inner) < 10: # 300_min10
            #     continue
            if min(bp_dist_outer, bp_dist_inner) < 3:  # 300_r
                continue
            # this is the standard.. ?
            # if min(bp_dist_outer, bp_dist_inner) < 1:
            #     continue

            # overwrite with better candidate
            print_d("candidate", i, p1, p2, curr_lvl, diff, "inner size",
                    inner_size, "outer size", outer_size, "opt:", optimize)
            c3.append((i, j, optimize, inner_size, outer_size,
                       bp_dist_inner, bp_dist_outer))

    """ recursion conditions: inner section > 20 bp outer / inner section in the region of 0.25 to 0.75 maximize inner section size which has at least a 3 bp distance to the outer section """

    # print ("---")
    print_d("found candidates:")
    # Ascending by score, so the best candidates claim positions first.
    c3 = sorted(c3, key=lambda item: item[2], reverse=False)  # highest opt.

    available = [0 for i in ptables_s1]
    for key in c3:
        # i and j still hold the values of the previous iteration here.
        print_d("c3", i, j)
        i = key[0]
        j = key[1]
        # all nucleotides between i and j need to be available
        if all(i == 0 for i in available[i + 1:j]):
            print_d('add', i, j)
            available[i:j] = [1] * (j - i)

    # Turn runs of 1s into [start, end] pairs; a run is closed only when a
    # 0 follows it.
    mode = 0
    indices = []
    for i in range(len(available)):
        if available[i] == 1:
            if mode == 0:
                start_pos = i
                mode = 1
        else:
            if mode == 1:
                indices.append([start_pos, i])
                mode = 0

    print_d("final indices", indices)
    return indices
def expand_graph(CG, saddles, args, mode='default'):
    """ Find new neighbors and add them to the Conformation Graph

    The function is divided into two parts. 1) The current mfe structure is
    connected to all present structures, 2) The conformation graph is
    expanded using helix-breathing.

    Reads ``CG.graph['seqid']``, ``CG.graph['full_sequence']`` and
    ``CG.graph['transcript_length']``; ``CG.graph['seqid']`` is incremented
    once per node that is given an identity here.

    :param CG: Conformation Graph (NetworkX)
    :param saddles: dictionary of all previous findpath runs
    :param args: commandline arguments and other global variables
        (using: occupancy_cutoff, verbose, min_breathing)
    :param mode: choose from
        (1) mfe-only: only use current mfe as potential new neighbor
        (2) breathing-only: only use breathing neighborhood,
        (3) default: do both mfe and breathing.

    :raises ValueError: if ``mode`` is not one of the three values above

    :return: Number of new nodes

    NOTE(review): the nesting of some statements was reconstructed from a
    source without reliable indentation -- confirm against the upstream
    file.
    """
    cutoff = args.occupancy_cutoff
    verb = args.verbose  # not used below
    mfree = args.min_breathing

    csid = CG.graph['seqid']  # identity counter on entry; used for the return value
    fseq = CG.graph['full_sequence']
    tlen = CG.graph['transcript_length']
    seq = fseq[0:tlen]  # the part of the full sequence that is folded here

    if mode not in ['default', 'mfe-only', 'breathing-only']:
        raise ValueError('unknown expansion mode')

    # Add MFE
    ss, mfe = RNA.fold(seq)
    # Pad the structure with dots up to the full sequence length.
    future = '.' * (len(fseq) - tlen)
    ss = ss + future
    #print >> sys.stderr, "{}\n{} {:6.2f}".format(seq, ss, mfe)

    regular_mode = True  # NOTE: HACK! this is only here to produce any possible graph

    # If there is no node bec we are in the beginning, add the node,
    # otherwise, go through all nodes and try to add transition edges
    if nx.number_of_nodes(CG) == 0:
        en = round(RNA.energy_of_structure(fseq, ss, 0), 2)
        CG.add_node(ss, energy=en, occupancy=1.0,
                    identity=CG.graph['seqid'], active=True)
        CG.graph['seqid'] += 1
    elif mode == 'default' or mode == 'mfe-only':
        for ni in CG.nodes():
            if CG.node[ni]['active'] == False:
                continue
            if ni == ss or CG.has_edge(ni, ss):
                continue
            if CG.has_node(ss):  # from a previous iteration
                if add_transition_edges(CG, saddles, args, ni, ss):
                    CG.node[ss][
                        'active'] = True  # in case it was there but inactive
            elif add_transition_edges(CG, saddles, args, ni, ss):
                # ss was not a node before this call; fill in its
                # attributes and give it the next identity.
                en = round(RNA.energy_of_structure(fseq, ss, 0), 2)
                CG.node[ss]['active'] = True
                CG.node[ss]['energy'] = en
                CG.node[ss]['occupancy'] = 0.0
                CG.node[ss]['identity'] = CG.graph['seqid']
                CG.graph['seqid'] += 1

    if mode == 'default' or mode == 'breathing-only':
        """ do the helix breathing graph expansion """
        for ni, data in CG.nodes_iter(data=True):
            if data['active'] == False:
                continue
            en = data['energy']
            occ = data['occupancy']
            # Only nodes with at least the cutoff occupancy are expanded.
            if regular_mode and occ < cutoff:
                continue

            sss = ni[0:len(seq)]

            opened = open_breathing_helices(seq, sss, free=mfree)
            #print opened
            for onbr in opened:
                nbr = fold_exterior_loop(seq, onbr)
                # Pad the neighbour with dots to the length of the node key.
                future = '.' * (len(ni) - len(nbr))
                nbr += future

                if ni == nbr or CG.has_edge(ni, nbr):
                    continue

                if CG.has_node(nbr):
                    if add_transition_edges(CG, saddles, args, ni, nbr):
                        CG.node[nbr][
                            'active'] = True  # in case it was there but inactive
                elif add_transition_edges(CG, saddles, args, ni, nbr):
                    enbr = round(RNA.energy_of_structure(fseq, nbr, 0), 2)
                    CG.node[nbr]['energy'] = enbr
                    CG.node[nbr]['active'] = True
                    CG.node[nbr]['occupancy'] = 0.0
                    CG.node[nbr]['identity'] = CG.graph['seqid']
                    CG.graph['seqid'] += 1
                else:
                    """# WARNING: Could not add transition edge!"""

    if not CG.has_node(ss) or CG.node[ss]['active'] is False:
        print "# WARNING: ", ss, "[mfe secondary structure not connected]"

    return CG.graph['seqid'] - csid
""" representatives are the structures and their free energies. """ f = open(pathToFile,"w") f.write(" "+seq+"\n") for i in range(len(representatives)): f.write(" "+str(i+1)+" "+representatives[i][0]+" "+str(representatives[i][1])+"\n") f.close() if __name__ == "__main__": #inputfile = sys.argv[1] #records = readFasta(inputfile) #records = readRNAxplorerFile(inputfile) seq = "GGGAAUUAUUGUUCCCUGAGAGCGGUAGUUCUC" (mfe_struct, mfe) = RNA.fold(seq) print mfe print mfe_struct sss = mainloop(seq, mfe_struct, mfe, 1) sss = uniq_list(sss) #cluster secondary-structure-set clusters = generateClusters(sss) #extract cluster representatives (could be centroid or mfe; here we chose mfe) representatives = extractClusterRepresentatives(seq, clusters) #openchain = "." * len(seq) #energy = RNA.energy_of_struct(seq,openchain) #representatives.insert(0, (openchain,energy)) #sort according to the free energy and if it is equal, lexographically.
def bp_dist_section(i, j):
    """Return the base pair distance of s1 and s2 on one section.

    Both structure strings (taken from the enclosing scope) are cut to the
    slice ``[i - 1:j]`` before they are compared.
    """
    section = slice(i - 1, j)
    return RNA.bp_distance(s1[section], s2[section])