def test_DnaSequence(self):
    """DnaSequence should upper-case, concatenate, validate and reverse-complement"""
    # Construction does not preserve the case of the input.
    from_lower = DnaSequence('tcag')
    self.assertEqual(from_lower, 'TCAG')

    # Concatenating two sequences does not preserve case either.
    left = DnaSequence('aaa')
    right = DnaSequence('ccc')
    joined = left + right
    self.assertEqual(joined, 'AAACCC')
    assert joined.MolType is DNA

    # Adding a character outside the alphabet must raise AlphabetError.
    self.assertRaises(AlphabetError, joined.__add__, 'z')

    # rc() of a mixed-case input: 'TTTAc' -> 'GTAAA'.
    mixed_case = DnaSequence('TTTAc')
    self.assertEqual(mixed_case.rc(), 'GTAAA')
def test_single_constructor(self):
    """RdbParser should use constructors if supplied"""
    def to_dna(x, Info):
        # Sequence constructor: swap U for T and attach the supplied Info.
        # The parameter name is kept as ``Info`` in case the parser passes
        # it by keyword -- TODO confirm against RdbParser.
        return DnaSequence(str(x).replace('U', 'T'), Info=Info)

    original = 'AGUCAUCUAGAUHCAUHC'
    converted = 'AGTCATCTAGATHCATHC'

    # Case 1: only the sequence constructor is supplied.
    records = list(RdbParser(self.oneseq, to_dna))
    self.assertEqual(len(records), 1)
    rec = records[0]
    self.assertEqual(rec, converted)
    self.assertEqual(
        rec.Info,
        Info({'Species': 'H.Sapiens', 'OriginalSeq': original}))

    def alternativeConstr(header_lines):
        """Build an Info from 'name:value' lines, upper-casing both parts."""
        info = Info()
        for line in header_lines:
            # Split on the first colon only, so values may contain colons.
            fields = line.strip().split(':', 1)
            # Skip empty lines, lines without a name, lines without a colon.
            if not fields[0] or len(fields) != 2:
                continue
            info[fields[0].upper()] = fields[1].strip().upper()
        return info

    # Case 2: both the sequence and the header constructor are supplied.
    records = list(RdbParser(self.oneseq, to_dna, alternativeConstr))
    self.assertEqual(len(records), 1)
    rec = records[0]
    self.assertEqual(rec, converted)
    exp_info = Info({'OriginalSeq': original, 'Refs': {}, 'SEQ': 'H.SAPIENS'})
    self.assertEqual(rec.Info, exp_info)
def test_SeqToProfile(self):
    """SequenceToProfile: should work with different parameter settings
    """
    seq = DnaSequence("ATCGRYN-")

    def indicator_rows(columns, width):
        # Zero matrix with one row per sequence position; row i gets a 1 in
        # column columns[i]. Positions beyond len(columns) stay all-zero.
        matrix = zeros([len(seq), width], Float64)
        for row, col in zip(range(len(seq)), columns):
            matrix[row, col] = 1
        return matrix

    # Only non-degenerate bases in the char order, all other
    # characters are ignored. In a sequence this means that
    # several positions will contain only zeros in the profile.
    exp = indicator_rows([2, 0, 1, 3], 4)
    profile = SeqToProfile(seq, char_order="TCAG", split_degenerates=False)
    self.assertEqual(profile.Data.tolist(), exp.tolist())

    # Same thing should work as well when the char order is not passed in
    exp = indicator_rows([2, 0, 1, 3], 4)
    profile = SeqToProfile(seq, split_degenerates=False)
    self.assertEqual(profile.Data.tolist(), exp.tolist())

    # All symbols in the sequence are in the char order, no row
    # should contain only zeros. Degenerate symbols are not split.
    exp = indicator_rows([2, 0, 1, 3, 4, 5, 6, 7], 8)
    profile = SeqToProfile(seq, char_order="TCAGRYN-",
                           split_degenerates=False)
    self.assertEqual(profile.Data.tolist(), exp.tolist())

    # Splitting all degenerate symbols, having only non-degenerate symbols
    # in the character order (and -)
    exp = array([
        [0, 0, 1, 0, 0],
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, .5, .5, 0],
        [.5, .5, 0, 0, 0],
        [.25, .25, .25, .25, 0],
        [0, 0, 0, 0, 1],
    ])
    profile = SeqToProfile(seq, char_order="TCAG-", split_degenerates=True)
    self.assertEqual(profile.Data.tolist(), exp.tolist())

    # Splitting degenerates, but having one of the degenerate
    # symbols in the character order. In that case the degenerate symbol
    # is not split.
    exp = array([
        [0, 0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 0, .5, .5, 0, 0],
        [.5, .5, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1],
    ])
    profile = SeqToProfile(seq, char_order="TCAGN-", split_degenerates=True)
    self.assertEqual(profile.Data.tolist(), exp.tolist())
def dnastrict(x, **kwargs):
    """Build a DnaSequence from x, reporting any failure as a RecordError.

    x and all keyword arguments are passed straight through to DnaSequence.

    Raises RecordError("Could not convert sequence") when DnaSequence raises
    any Exception.
    """
    try:
        return DnaSequence(x, **kwargs)
    except Exception:
        # Call-form raise: identical behaviour on Python 2, and unlike the
        # ``raise Cls, "msg"`` form it is not a SyntaxError on Python 3.
        raise RecordError("Could not convert sequence")
def Dna(seq, *args, **kwargs):
    """Return a DnaSequence built from seq with every u/U replaced by t/T.

    Extra positional and keyword arguments are forwarded to DnaSequence.
    """
    # Lower case first, then upper case: same order as the two-step original.
    as_dna = seq.replace('u', 't').replace('U', 'T')
    return DnaSequence(as_dna, *args, **kwargs)
def default_distance_function(first, second):
    """Return DnaSequence(first).fracDiff(second).

    Only ``first`` is wrapped in a DnaSequence; ``second`` is handed to
    fracDiff exactly as received.
    """
    return DnaSequence(first).fracDiff(second)