def GenomicSprinzlParser(infile, fix_sequence=False):
    """Parser for the Genomic tRNA Database.

    Assumes the file has been prepared by the following method:

    1. Set all search fields to empty.
    2. Check all the results fields.
    3. Perform the search (this takes a while).
    4. Save the results worksheet as tab-delimited text.

    Note that the alignment length is supposed to be 99 bases, but not all the
    sequences have been padded out with the correct number of hyphens.

    infile: iterable of lines. Lines that are empty after stripping trailing
    whitespace are ignored. The first remaining line is the header; after
    that, lines are consumed three at a time (info line, sequence line,
    structure line).

    fix_sequence: if True, the raw sequence string is passed through
    _fix_sequence before the Rna object is constructed.

    Yields one Rna object per complete three-line record, constructed with
    the record's info dict as Info and with Pairing set from the structure
    line. A trailing incomplete record is not yielded.
    """
    #number of leading header fields skipped before the labels start
    num_blanks = 4
    first = True
    for raw_line in infile:
        #skip blank lines
        line = raw_line.rstrip()
        if not line:
            continue

        if first:
            #header line
            #for unknown reasons, some of the field headers have '.' instead
            #of '0', e.g. '7.' instead of '70'.
            header = line.replace('.', '0').split('\t')
            labels = InverseDict(enumerate(header[num_blanks:]))
            first = False
            offset = 0
            continue

        #expect 3 record lines at a time
        fields = line.split('\t')
        if offset == 0:
            #info line
            info = dict(zip(GenomicFields, [strip(f) for f in fields]))
            #add in the labels (the same labels object is shared by all records)
            info['Labels'] = labels
            #convert the taxonomy from a string to a list; built explicitly
            #so it is a real list even where map() returns an iterator
            info['Taxonomy'] = [strip(t) for t in info['Taxonomy'].split(';')]
            #convert the anticodon into RNA
            info['Anticodon'] = Rna(info['Anticodon'])
            #get rid of the empty fields
            del info['']
        elif offset == 1:
            #sequence line
            raw_seq = ''.join([strip(f) for f in fields])
            #for some reason, there are underscores in some sequences
            raw_seq = raw_seq.replace('_', '-')
            if fix_sequence:
                raw_seq = _fix_sequence(raw_seq)
            seq = Rna(raw_seq, Info=info)
        else:
            #structure line: completes the record
            seq.Pairing = _fix_structure(fields, seq)
            yield seq

        #figure out which type of line we're expecting next
        offset = (offset + 1) % 3
def test_getSimilar(self):
    """Alignment getSimilar should get all sequences close to target seq"""

    def check(result, source, expected_keys):
        """Verify result holds exactly expected_keys with source's values.

        Uses TestCase assertion methods rather than bare assert statements
        so the checks still run when Python is started with -O.
        """
        for key in expected_keys:
            self.assertTrue(key in result)
            self.assertTrue(result[key] == source[key])
        self.assertEqual(len(result), len(expected_keys))

    aln = self.many
    target = aln['a']

    #test min and max similarity ranges
    result = aln.getSimilar(target, min_similarity=0.4, max_similarity=0.7)
    check(result, aln, 'cefg')

    result = aln.getSimilar(target, min_similarity=0.95, max_similarity=1)
    check(result, aln, 'a')

    result = aln.getSimilar(target, min_similarity=0.75,
                            max_similarity=0.85)
    check(result, aln, 'bd')

    result = aln.getSimilar(target, min_similarity=0, max_similarity=0.2)
    self.assertEqual(len(result), 0)

    #test some sequence transformations
    result = aln.getSimilar(target, min_similarity=0.5,
                            transform=lambda s: s[1:4])
    check(result, aln, 'abdfg')

    result = aln.getSimilar(target, min_similarity=0.5,
                            transform=lambda s: s[-3:])
    check(result, aln, 'abcde')

    #test a different distance metric
    def g_count(first, second):
        return first.count('g') + second.count('g')

    result = aln.getSimilar(target, min_similarity=5, max_similarity=10,
                            metric=g_count)
    check(result, aln, 'ef')

    #test the combination of a transform and a distance metric
    aln = Alignment(dict(enumerate(map(Rna, ['aA-ac', 'A-aAC', 'aa-aa']))))
    target = aln[0]
    metric = RnaSequence.fracSameNonGaps

    def to_upper(s):
        return Rna(s.upper())

    #first, do it without the transformation
    result = aln.getSimilar(target, min_similarity=0.5, metric=metric)
    check(result, aln, [0, 2])

    result = aln.getSimilar(target, min_similarity=0.8, metric=metric)
    check(result, aln, [0])

    #then, verify that the transform changes the results
    result = aln.getSimilar(target, min_similarity=0.5, metric=metric,
                            transform=to_upper)
    check(result, aln, [0, 1, 2])

    result = aln.getSimilar(target, min_similarity=0.8, metric=metric,
                            transform=to_upper)
    check(result, aln, [0, 1])