def test_get_counts(self):
    """get_counts: should work with all parameters"""
    p = Pairs([(1, 8), (2, 7)])
    p2 = Pairs([(1, 8), (2, 6), (3, 6), (4, 9)])

    # Default call: the FP_* split counters are expected to stay at 0.
    exp = {'TP': 1, 'TN': 0, 'FN': 1, 'FP': 3,
           'FP_INCONS': 0, 'FP_CONTRA': 0, 'FP_COMP': 0}
    self.assertEqual(get_counts(p, p2), exp)

    # split_fp=True: the three false positives are expected to be spread
    # over the FP_INCONS / FP_CONTRA / FP_COMP counters.
    exp = {'TP': 1, 'TN': 0, 'FN': 1, 'FP': 3,
           'FP_INCONS': 1, 'FP_CONTRA': 1, 'FP_COMP': 1}
    self.assertEqual(get_counts(p, p2, split_fp=True), exp)

    # With sequences and min_dist supplied, a non-zero TN is expected.
    seq = RnaSequence('UCAG-NACGU')
    exp = {'TP': 1, 'TN': 7, 'FN': 1, 'FP': 3,
           'FP_INCONS': 1, 'FP_CONTRA': 1, 'FP_COMP': 1}
    self.assertEqual(
        get_counts(p, p2, split_fp=True, sequences=[seq], min_dist=2),
        exp)

    # check against compare_ct.pm
    exp = {'TP': 4, 'TN': 266, 'FN': 6, 'FP': 6,
           'FP_INCONS': 2, 'FP_CONTRA': 2, 'FP_COMP': 2}
    seq = 'agguugaaggggauccgauccacuccccggcuggucaaccu'.upper()
    self.assertEqual(
        get_counts(self.true, self.predicted, split_fp=True,
                   sequences=[seq], min_dist=4),
        exp)
def test_get_all_pairs(self):
    """get_all_pairs: should return the number of possible pairs"""
    first = RnaSequence('UCAG-NACGU')
    second = RnaSequence('UAAG-CACGC')

    # (sequences, min_dist, expected number of pairs)
    cases = [
        ([first], 4, 6),
        ([second], 4, 4),
        # when given multiple sequences, should average over all of them
        ([first, second], 4, 5),
        # different min distance
        ([first], 2, 10),
    ]
    for seqs, dist, expected in cases:
        self.assertEqual(get_all_pairs(seqs, min_dist=dist), expected)

    # error on invalid minimum distance
    self.assertRaises(ValueError, get_all_pairs, [first], min_dist=-2)
def test_seq_ungapping(self):
    """degap: should remove gaps from RnaSequence and ModelSequence"""
    gapped = 'U-C-A-G-'
    plain_seq = RnaSequence(gapped, Name='rna1')
    model_seq = ModelSequence(
        gapped, Name='rna1', Alphabet=RNA.Alphabets.DegenGapped)

    # regular sequence: compares equal to the raw string, degaps cleanly
    self.assertEqual(plain_seq, 'U-C-A-G-')
    self.assertEqual(plain_seq.degap(), 'UCAG')

    # check it produces the right string from the beginning
    self.assertEqual(str(model_seq), 'U-C-A-G-')
    self.assertEqual(model_seq._data, [0, 4, 1, 4, 2, 4, 3, 4])

    # ModelSequence should maybe have the same degap method as normal Seq
    degapped_model = model_seq.degap()
    self.assertEqual(str(degapped_model), 'UCAG')
def test_seq_ungapping(self):
    """degap: should remove gaps from RnaSequence and ModelSequence"""
    gapped = 'U-C-A-G-'
    plain_seq = RnaSequence(gapped, Name='rna1')
    model_seq = ModelSequence(
        gapped, Name='rna1', Alphabet=RNA.Alphabets.DegenGapped)

    # regular sequence: compares equal to the raw string, degaps cleanly
    self.assertEqual(plain_seq, 'U-C-A-G-')
    self.assertEqual(plain_seq.degap(), 'UCAG')

    # check it produces the right string from the beginning
    self.assertEqual(str(model_seq), 'U-C-A-G-')
    self.assertEqual(model_seq._data, [0, 4, 1, 4, 2, 4, 3, 4])

    # ModelSequence should maybe have the same degap method as normal Seq
    degapped_model = model_seq.degap()
    self.assertEqual(str(degapped_model), 'UCAG')
def test_subset_positions_Alignment(self):
    """takePositions: should return the alignment of the chosen columns"""
    expected = Alignment(
        [
            RnaSequence('UCG', Name='rna1'),
            RnaSequence('YCG', Name='rna2'),
            RnaSequence('CAR', Name='rna3'),
        ],
        MolType=RNA)

    # columns 0, 1 and 5 of the fixture alignment
    observed = self.aln.takePositions([0, 1, 5])
    self.assertEqual(observed, expected)
    self.assertNotEqual(observed, self.aln)

    # string representations should be the same. This fails right
    # now, because sequence order is not maintained. See separate test.
    self.assertEqual(str(observed), str(expected))
def test_subset_seqs_Alignment(self):
    """takeSeqs: should return the alignment of the named sequences"""
    first = RnaSequence('UCG', Name='rna1')
    second = RnaSequence('YCG', Name='rna2')
    third = RnaSequence('CAR', Name='rna3')
    expected = Alignment([second, third], MolType=RNA)
    full_aln = Alignment([first, second, third], MolType=RNA)

    observed = full_aln.takeSeqs(['rna2', 'rna3'])
    self.assertEqual(observed, expected)
    self.assertEqual(str(observed), str(expected))

    # Selected sequences should be in specified order?
    reversed_order = self.aln.takeSeqs(['rna3', 'rna2'])
    given_order = self.aln.takeSeqs(['rna2', 'rna3'])
    self.assertNotEqual(str(reversed_order), str(given_order))
def setUp(self):
    """setUp: set up method for all tests"""
    # The same gapped, degenerate RNA string in three representations.
    raw = 'UCAG-RYN-N'
    self.rna1 = RnaSequence(raw, Name='rna1')
    self.m1 = ModelSequence(
        raw, Name='rna1', Alphabet=RNA.Alphabets.DegenGapped)
    self.s1 = raw
def test_extract_seqs(self):
    """extract_seqs: should handle different input formats"""
    fasta_string = ">seq1\nACGUAGC\n>seq2\nGGUAGCG"
    fasta_lines = [">seq1", "ACGUAGC", ">seq2", "GGUAGCG"]
    plain_strings = ['ACGUAGC', 'GGUAGCG']
    rna_objects = [RnaSequence('ACGUAGC'), RnaSequence('GGUAGCG')]
    model_objects = [
        ModelSequence('ACGUAGC', Name='rna1',
                      Alphabet=RNA.Alphabets.DegenGapped),
        ModelSequence('GGUAGCG', Name='rna2',
                      Alphabet=RNA.Alphabets.DegenGapped),
    ]

    # Every input format must yield the same two plain sequence strings.
    all_inputs = (fasta_string, fasta_lines, plain_strings,
                  rna_objects, model_objects)
    for given in all_inputs:
        self.assertEqual(extract_seqs(given), ['ACGUAGC', 'GGUAGCG'])
def test_AlignmentToProfile_ignore(self):
    """AlignmentToProfile: should raise an error if too many chars ignored
    """
    # Both sequences end in a gap, so the last column contains only gaps
    # and normalization is expected to fail at that position.
    a = Alignment({'a': RnaSequence('UCAGRYN-'),
                   'b': RnaSequence('ACUGAAA-')})
    # No expected profile is built here: the call itself must raise.
    self.assertRaises(ValueError, AlnToProfile, a, alphabet=RNA,
                      split_degenerates=True)
def test_gapped_to_ungapped_simple(self):
    """gapped_to_ungapped: should work for simple case"""
    s = RnaSequence(self.gapped)
    p = self.simple_g
    obs_seq, obs_pairs = gapped_to_ungapped(s, p)
    self.assertEqual(obs_seq, self.ungapped)
    self.assertEqualItems(obs_pairs, self.simple)
    # TestCase assertions instead of bare ``assert`` so the type checks
    # are not stripped when the tests run under ``python -O``.
    self.assertTrue(isinstance(obs_seq, RnaSequence))
    self.assertTrue(isinstance(obs_pairs, Pairs))
def parse_residues(residue_lines, num_base, unpaired_symbol):
    """Return RnaSequence and Pairs object from residue lines.

    residue_lines -- list of lines or anything that behaves like it.
        Lines should contain:
        residue_position, residue_identity, residue_partner.
    num_base -- int, basis of the residue numbering. In bpseq files from
        the CRW website, the numbering starts at 1.
    unpaired_symbol -- string, symbol in the 'partner' column that
        indicates that a base is unpaired. In bpseq files from the CRW
        website, the unpaired_symbol is '0'. This parameter should be a
        string to allow other symbols that can't be cast to an integer
        to indicate unpaired bases.

    Checks for double entries both in the sequence and the structure, and
    checks that the structure is valid in the sense that if (up,down) is
    in there, that (down,up) is the same.

    Raises BpseqParseError when a line cannot be split into three fields
    or its numbers cannot be converted, when a residue position occurs
    twice, or when the list of base pairs has conflicts.
    """
    # position -> residue, and the (position, partner) list
    seq_dict = {}
    pairs = Pairs()

    for line in residue_lines:
        # Only the parsing itself is guarded, so that a ValueError from
        # unpacking or int() is reported as an unparsable line.
        try:
            pos, res, partner = line.strip().split()
            # positions are shifted to be relative to num_base
            pos = int(pos) - num_base
            if partner == unpaired_symbol:
                partner = None
            else:
                partner = int(partner) - num_base
        except ValueError:
            raise BpseqParseError("Failed to parse line: %s" % (line))

        pairs.append((pos, partner))

        # fill seq_dict, refusing a second entry for the same position
        if pos in seq_dict:
            # Report the position as written in the file, i.e. shifted
            # back by num_base (not by a hard-coded 1).
            raise BpseqParseError(
                "Double entry for residue %s (%s in bpseq file)"
                % (str(pos), str(pos + num_base)))
        seq_dict[pos] = res

    # check for conflicts, remove unpaired bases
    if pairs.hasConflicts():
        raise BpseqParseError("Conflicts in the list of basepairs")
    pairs = pairs.directed()
    pairs.sort()

    # construct sequence from seq_dict
    seq = RnaSequence(construct_sequence(seq_dict))

    return seq, pairs
def test_gapped_to_ungapped_out_of_order(self):
    """gapped_to_ungapped: should work when pairs are out of order
    """
    s = RnaSequence(self.gapped)
    p = Pairs(self.out_order_g)
    obs_seq, obs_pairs = gapped_to_ungapped(s, p)
    self.assertEqual(obs_seq, self.ungapped)
    self.assertEqualItems(obs_pairs, self.out_order)
    # TestCase assertions instead of bare ``assert`` so the type checks
    # are not stripped when the tests run under ``python -O``.
    self.assertTrue(isinstance(obs_seq, RnaSequence))
    self.assertTrue(isinstance(obs_pairs, Pairs))
def test_gapped_to_ungapped_duplicates(self):
    """gapped_to_ungapped: should work when pairs contains duplicates
    """
    s = RnaSequence(self.gapped)
    p = Pairs(self.duplicates_g)
    obs_seq, obs_pairs = gapped_to_ungapped(s, p)
    self.assertEqual(obs_seq, self.ungapped)
    self.assertEqualItems(obs_pairs, self.duplicates)
    # TestCase assertions instead of bare ``assert`` so the type checks
    # are not stripped when the tests run under ``python -O``.
    self.assertTrue(isinstance(obs_seq, RnaSequence))
    self.assertTrue(isinstance(obs_pairs, Pairs))
def test_gapped_to_ungapped_pseudo(self):
    """gapped_to_ungapped: shouldn't care about pseudoknots
    """
    s = RnaSequence(self.gapped)
    p = Pairs(self.pseudo_g)
    obs_seq, obs_pairs = gapped_to_ungapped(s, p)
    self.assertEqual(obs_seq, self.ungapped)
    self.assertEqualItems(obs_pairs, self.pseudo)
    # TestCase assertions instead of bare ``assert`` so the type checks
    # are not stripped when the tests run under ``python -O``.
    self.assertTrue(isinstance(obs_seq, RnaSequence))
    self.assertTrue(isinstance(obs_pairs, Pairs))
def test_gapped_to_ungapped_no_gaps(self):
    """gapped_to_ungapped: should return same pairs when no gaps
    """
    s = RnaSequence(self.ungapped)
    p = Pairs(self.simple)
    obs_seq, obs_pairs = gapped_to_ungapped(s, p)
    self.assertEqual(obs_seq, self.ungapped)
    self.assertEqualItems(obs_pairs, self.simple)
    # TestCase assertions instead of bare ``assert`` so the type checks
    # are not stripped when the tests run under ``python -O``.
    self.assertTrue(isinstance(obs_seq, RnaSequence))
    self.assertTrue(isinstance(obs_pairs, Pairs))
def test_ungapped_to_gapped(self):
    """ungapped_to_gapped: should work for basic case
    """
    s = RnaSequence(self.gapped)
    p = self.simple
    obs_seq, obs_pairs = ungapped_to_gapped(s, p)
    # TestCase assertions instead of bare ``assert`` so the identity and
    # type checks are not stripped when the tests run under ``python -O``.
    # The very same sequence object is expected back.
    self.assertTrue(obs_seq is s)
    self.assertEqualItems(obs_pairs, self.simple_g)
    self.assertTrue(isinstance(obs_seq, RnaSequence))
    self.assertTrue(isinstance(obs_pairs, Pairs))
def test_ungapped_to_gapped_out_of_order(self):
    """ungapped_to_gapped: should work when pairs out of order
    """
    s = RnaSequence(self.gapped)
    p = self.out_order
    obs_seq, obs_pairs = ungapped_to_gapped(s, p)
    # TestCase assertions instead of bare ``assert`` so the identity and
    # type checks are not stripped when the tests run under ``python -O``.
    # The very same sequence object is expected back.
    self.assertTrue(obs_seq is s)
    self.assertEqualItems(obs_pairs, self.out_order_g)
    self.assertTrue(isinstance(obs_seq, RnaSequence))
    self.assertTrue(isinstance(obs_pairs, Pairs))
def test_adjust_pairs_from_mapping(self):
    """adjust_pairs_from_mapping: should work both ways
    """
    # ungapped to gapped
    # mapping: {0: 0, 1: 1, 2: 3, 3: 4, 4: 6, 5: 7, 6: 9, 7: 10, 8: 12}
    ungapped_to_gapped_map = RnaSequence('UC-AG-UC-CG-A-').gapMaps()[0]
    pairs_in = Pairs([(0, 8), (1, 6), (2, 5)])
    pairs_expected = Pairs([(0, 12), (1, 9), (3, 7)])
    observed = adjust_pairs_from_mapping(pairs_in, ungapped_to_gapped_map)
    self.assertEqualItems(observed, pairs_expected)

    # gapped to ungapped
    # mapping: {0: 0, 1: 1, 3: 2, 4: 3, 6: 4, 7: 5, 9: 6, 10: 7, 12: 8}
    gapped_to_ungapped_map = RnaSequence('UC-AG-UC-CG-A-').gapMaps()[1]
    pairs_in = Pairs([(0, 12), (1, 9), (3, 7)])
    pairs_expected = Pairs([(0, 8), (1, 6), (2, 5)])
    observed = adjust_pairs_from_mapping(pairs_in, gapped_to_ungapped_map)
    self.assertEqualItems(observed, pairs_expected)
def test_AlignmentToProfile_basic(self):
    """AlignmentToProfile: should work under basic conditions
    """
    # sequences in the alignment are unweighted
    # Alphabet is the alphabet of the sequences (RNA)
    # CharOrder is set explicitly
    # Degenerate bases are split up
    # Gaps are ignored
    # In all of the columns at least one character is in the CharOrder
    aln = Alignment({'a': RnaSequence('UCAGRYN-'),
                     'b': RnaSequence('ACUGAAAA')})
    # one row per alignment column
    expected_rows = [
        [.5, 0, .5, 0],
        [0, 1, 0, 0],
        [.5, 0, .5, 0],
        [0, 0, 0, 1],
        [0, 0, .75, .25],
        [.25, .25, .5, 0],
        [.125, .125, .625, .125],
        [0, 0, 1, 0],
    ]
    expected = array(expected_rows)
    profile = AlnToProfile(aln, alphabet=RNA, split_degenerates=True)
    self.assertEqual(profile.Data.tolist(), expected.tolist())
def setUp(self):
    """setUp method for all tests"""
    degen_gapped = RNA.Alphabets.DegenGapped

    # named sequences
    self.rna1 = RnaSequence('UCAGGG', Name='rna1')
    self.rna2 = RnaSequence('YCU-RG', Name='rna2')
    self.rna3 = RnaSequence('CAA-NR', Name='rna3')
    self.model1 = ModelSequence('UCAGGG', Name='rna1',
                                Alphabet=degen_gapped)
    self.model2 = ModelSequence('YCU-RG', Name='rna2',
                                Alphabet=degen_gapped)
    self.model3 = ModelSequence('CAA-NR', Name='rna3',
                                Alphabet=degen_gapped)

    named_rnas = [self.rna1, self.rna2, self.rna3]
    named_models = [self.model1, self.model2, self.model3]
    self.aln = Alignment(named_rnas, MolType=RNA)
    self.da = DenseAlignment(named_models, MolType=RNA,
                             Alphabet=degen_gapped)

    # seqs no name
    self.nn_rna1 = RnaSequence('UCAGGG')
    self.nn_rna2 = RnaSequence('YCU-RG')
    self.nn_rna3 = RnaSequence('CAA-NR')
    self.nn_model1 = ModelSequence('UCAGGG', Alphabet=degen_gapped)
    self.nn_model2 = ModelSequence('YCU-RG', Alphabet=degen_gapped)
    self.nn_model3 = ModelSequence('CAA-NR', Alphabet=degen_gapped)

    unnamed_rnas = [self.nn_rna1, self.nn_rna2, self.nn_rna3]
    unnamed_models = [self.nn_model1, self.nn_model2, self.nn_model3]
    self.nn_aln = Alignment(unnamed_rnas, MolType=RNA)
    self.nn_da = DenseAlignment(unnamed_models, MolType=RNA,
                                Alphabet=degen_gapped)
def test_full(self):
    """RdbParser: full data, valid and invalid"""
    r1 = RnaSequence(
        "-??GG-UGAA--CGCU---ACGU-N???---",
        Info=Info({
            'Species': "unidentified Thermus OPB AF027020",
            'Refs': {'rRNA': ['AF027020']},
            'OriginalSeq': '-o[oGG-U{G}AA--C^GC]U---ACGU-Nooo---',
        }))
    r2 = RnaSequence(
        "---CGAUCG--UAUACG-N???-",
        Info=Info({
            'Species': 'Thermus silvanus X84211',
            'Refs': {'rRNA': ['X84211']},
            'OriginalSeq': '---CGAU[C(G){--UA}U]ACG-Nooo-',
        }))

    def check_record(observed, expected):
        # sequence, string form and Info must all agree
        self.assertEqual(observed, expected)
        self.assertEqual(str(observed), str(expected))
        self.assertEqual(observed.Info, expected.Info)

    # when only good record, should work independent of strict
    records = list(RdbParser(RDB_LINES_ONLY_GOOD.split('\n'), strict=True))
    self.assertEqual(len(records), 2)
    check_record(records[0], r1)
    check_record(records[1], r2)

    records = list(RdbParser(RDB_LINES_ONLY_GOOD.split('\n'),
                             strict=False))
    self.assertEqual(len(records), 2)
    check_record(records[0], r1)

    # when strict, should raise error on invalid record
    strict_parser = RdbParser(RDB_LINES_GOOD_BAD.split('\n'), strict=True)
    self.assertRaises(RecordError, list, strict_parser)

    # when not strict, the invalid record is skipped
    records = list(RdbParser(RDB_LINES_GOOD_BAD.split('\n'), strict=False))
    self.assertEqual(len(records), 2)
    check_record(records[0], r1)
    check_record(records[1], r2)
def test_ungapped_to_gapped_general(self):
    """ungapped_to_gapped: should return object of right type
    """
    # TestCase assertions are used instead of bare ``assert`` so the
    # identity and type checks are not stripped under ``python -O``.
    s = RnaSequence(self.gapped)
    p = self.simple

    # in case of RnaSequence
    obs_seq, obs_pairs = ungapped_to_gapped(s, p)
    self.assertTrue(obs_seq is s)
    self.assertEqualItems(obs_pairs, self.simple_g)
    self.assertTrue(isinstance(obs_seq, RnaSequence))
    self.assertTrue(isinstance(obs_pairs, Pairs))

    # in case of str input
    s = self.gapped
    obs_seq, obs_pairs = ungapped_to_gapped(s, p)
    self.assertTrue(obs_seq is s)
    self.assertEqualItems(obs_pairs, self.simple_g)
    self.assertFalse(isinstance(obs_seq, RnaSequence))
    self.assertTrue(isinstance(obs_seq, str))
    self.assertTrue(isinstance(obs_pairs, Pairs))
def test_adjust_pairs_from_mapping(self):
    """adjust_pairs_from_mapping: should work both ways
    """
    # ungapped to gapped
    # mapping: {0: 0, 1: 1, 2: 3, 3: 4, 4: 6, 5: 7, 6: 9, 7: 10, 8: 12}
    ungapped_to_gapped_map = RnaSequence('UC-AG-UC-CG-A-').gapMaps()[0]
    pairs_in = Pairs([(0, 8), (1, 6), (2, 5)])
    pairs_expected = Pairs([(0, 12), (1, 9), (3, 7)])
    observed = adjust_pairs_from_mapping(pairs_in, ungapped_to_gapped_map)
    self.assertEqualItems(observed, pairs_expected)

    # gapped to ungapped
    # mapping: {0: 0, 1: 1, 3: 2, 4: 3, 6: 4, 7: 5, 9: 6, 10: 7, 12: 8}
    gapped_to_ungapped_map = RnaSequence('UC-AG-UC-CG-A-').gapMaps()[1]
    pairs_in = Pairs([(0, 12), (1, 9), (3, 7)])
    pairs_expected = Pairs([(0, 8), (1, 6), (2, 5)])
    observed = adjust_pairs_from_mapping(pairs_in, gapped_to_ungapped_map)
    self.assertEqualItems(observed, pairs_expected)
def rna_distance(first, second):
    """Return RnaSequence(first).fracDiff(second).

    first -- sequence data; it is wrapped in an RnaSequence first.
    second -- passed unchanged to fracDiff.
    """
    return RnaSequence(first).fracDiff(second)
def rna_distance(first, second):
    """Return RnaSequence(first).fracDiff(second).

    first -- sequence data; it is wrapped in an RnaSequence first.
    second -- passed unchanged to fracDiff.
    """
    return RnaSequence(first).fracDiff(second)
def Rna(x, Info=None):
    """Return an RnaSequence built from x, upper-cased with 'T' -> 'U'.

    x -- a string, or a list of strings that is joined into one string.
    Info -- optional data handed to InfoClass; an empty dict is used
        when it is None.
    """
    text = ''.join(x) if isinstance(x, list) else x
    info_data = {} if Info is None else Info
    normalized = text.upper().replace('T', 'U')
    return RnaSequence(normalized, Info=InfoClass(info_data))
def test_fromPair(self):
    """Counts fromPair should return correct counts."""
    s = Counts.fromPair(RnaSequence('UCCGAUCGAUUAUCGGGUACGUA'),
                        RnaSequence('GUCGAGUAUAGCGUACGGCUACG'), RnaPairs)
    # TestCase assertion instead of bare ``assert`` so the type check is
    # not stripped when the tests run under ``python -O``.
    self.assertTrue(isinstance(s, Counts))

    # (first char, second char, expected count)
    vals = [
        ('U', 'U', 0), ('U', 'C', 2.5), ('U', 'A', 1), ('U', 'G', 2.5),
        ('C', 'U', 2.5), ('C', 'C', 1), ('C', 'A', 1), ('C', 'G', 0.5),
        ('A', 'U', 1), ('A', 'C', 1), ('A', 'A', 1), ('A', 'G', 2),
        ('G', 'U', 2.5), ('G', 'C', 0.5), ('G', 'A', 2), ('G', 'G', 2),
    ]
    for i, j, val in vals:
        self.assertFloatEqual(s[i, j], val)

    # check that it works for big seqs
    s = Counts.fromPair(RnaSequence('UCAG' * 1000),
                        RnaSequence('UGAG' * 1000), RnaPairs)
    self.assertTrue(isinstance(s, Counts))
    vals = [
        ('U', 'U', 1000), ('U', 'C', 0), ('U', 'A', 0), ('U', 'G', 0),
        ('C', 'U', 0), ('C', 'C', 0), ('C', 'A', 0), ('C', 'G', 500),
        ('A', 'U', 0), ('A', 'C', 0), ('A', 'A', 1000), ('A', 'G', 0),
        ('G', 'U', 0), ('G', 'C', 500), ('G', 'A', 0), ('G', 'G', 1000),
    ]
    for i, j, val in vals:
        self.assertFloatEqual(s[i, j], val)

    # check that it works for codon seqs
    s1 = ModelRnaCodonSequence('UUCGCG')
    s2 = ModelRnaCodonSequence('UUUGGG')
    c = Counts.fromPair(s1, s2, RNA.Alphabet.Triples**2)
    self.assertEqual(c._data.sum(), 2)
    self.assertEqual(c._data[0, 1], 0.5)
    self.assertEqual(c._data[1, 0], 0.5)
    self.assertEqual(c._data[55, 63], 0.5)
    self.assertEqual(c._data[63, 55], 0.5)