Ejemplo n.º 1
0
def GenomicSprinzlParser(infile,fix_sequence=False):
    """Yield one sequence object per record of a Genomic tRNA Database export.

    The input is expected to have been produced as follows:
    1. Leave every search field empty.
    2. Tick every results field.
    3. Run the search (this is slow).
    4. Save the results worksheet as tab-delimited text.

    infile: iterable of text lines. Blank lines are ignored. The first
    non-blank line is treated as the header; after it, lines are consumed
    in groups of three (info line, sequence line, structure line).

    fix_sequence: if true, the raw sequence string is passed through
    _fix_sequence before the sequence object is built.

    Each yielded object is built with Rna(raw_seq, Info=info) and has its
    Pairing attribute assigned from _fix_structure(fields, seq).

    Note: the alignment is nominally 99 bases long, but not every sequence
    has been padded with the right number of hyphens.
    """
    label_start = 4     #leading header columns skipped before the labels
    seen_header = False
    phase = 0           #0 = info line, 1 = sequence line, 2 = structure line
    for raw_line in infile:
        text = raw_line.rstrip()
        if not text:
            continue    #blank line: nothing to parse

        if not seen_header:
            #some header cells contain '.' where '0' was meant, e.g. '7.'
            #in place of '70'; the reason is unknown.
            header_cells = text.replace('.', '0').split('\t')
            labels = InverseDict(enumerate(header_cells[label_start:]))
            seen_header = True
            phase = 0
            continue

        fields = text.split('\t')
        if phase == 0:
            #info line: pair each stripped field with its column name
            stripped_fields = [strip(cell) for cell in fields]
            info = dict(zip(GenomicFields, stripped_fields))
            #attach the header labels
            info['Labels'] = labels
            #split the taxonomy string on ';' and strip each piece
            info['Taxonomy'] = map(strip, info['Taxonomy'].split(';'))
            #wrap the anticodon as RNA
            info['Anticodon'] = Rna(info['Anticodon'])
            #drop the entry keyed by the empty string
            del info['']
        elif phase == 1:
            #sequence line: glue the stripped cells into one string
            joined = ''.join([strip(cell) for cell in fields])
            #some sequences contain underscores; replace them with hyphens
            raw_seq = joined.replace('_', '-')
            if fix_sequence:
                raw_seq = _fix_sequence(raw_seq)
            seq = Rna(raw_seq, Info=info)
        else:
            #structure line: completes the record started two lines ago
            seq.Pairing = _fix_structure(fields, seq)
            yield seq

        #cycle 0 -> 1 -> 2 -> 0 to track which record line comes next
        phase = (phase + 1) % 3
Ejemplo n.º 2
0
    def test_getSimilar(self):
        """Alignment getSimilar should get all sequences close to target seq"""
        def check(result, source, expected):
            """Assert every key in expected is in result with a value equal
            to source[key], and that result has len(expected) entries."""
            #TestCase assertions are used instead of bare assert statements
            #so the checks still run under python -O and report details
            for key in expected:
                self.assertTrue(key in result)
                self.assertEqual(result[key], source[key])
            self.assertEqual(len(result), len(expected))

        aln = self.many

        #test min and max similarity ranges
        result = aln.getSimilar(aln['a'], min_similarity=0.4,
            max_similarity=0.7)
        check(result, aln, 'cefg')

        result = aln.getSimilar(aln['a'], min_similarity=0.95,
            max_similarity=1)
        check(result, aln, 'a')

        result = aln.getSimilar(aln['a'], min_similarity=0.75,
            max_similarity=0.85)
        check(result, aln, 'bd')

        #nothing should fall in this range
        result = aln.getSimilar(aln['a'], min_similarity=0,
            max_similarity=0.2)
        check(result, aln, '')

        #test some sequence transformations
        transform = lambda s: s[1:4]
        result = aln.getSimilar(aln['a'], min_similarity=0.5,
            transform=transform)
        check(result, aln, 'abdfg')

        transform = lambda s: s[-3:]
        result = aln.getSimilar(aln['a'], min_similarity=0.5,
            transform=transform)
        check(result, aln, 'abcde')

        #test a different distance metric
        metric = lambda x, y: x.count('g') + y.count('g')
        result = aln.getSimilar(aln['a'], min_similarity=5,
            max_similarity=10, metric=metric)
        check(result, aln, 'ef')

        #test the combination of a transform and a distance metric
        aln = Alignment(dict(enumerate(map(Rna, ['aA-ac','A-aAC','aa-aa']))))
        transform = lambda s: Rna(s.upper())
        metric = RnaSequence.fracSameNonGaps
        #first, do it without the transformation
        result = aln.getSimilar(aln[0], min_similarity=0.5, metric=metric)
        check(result, aln, [0, 2])

        result = aln.getSimilar(aln[0], min_similarity=0.8, metric=metric)
        check(result, aln, [0])

        #then, verify that the transform changes the results
        result = aln.getSimilar(aln[0], min_similarity=0.5, metric=metric,
            transform=transform)
        check(result, aln, [0, 1, 2])

        result = aln.getSimilar(aln[0], min_similarity=0.8, metric=metric,
            transform=transform)
        check(result, aln, [0, 1])