예제 #1
0
 def test_toLogOddsMatrix(self):
     """toLogOddsMatrix: should work as expected"""
     # Deliberately short: toLogOddsMatrix mostly relies on toOddsMatrix,
     # which is tested exhaustively on its own.
     freqs = array(
         [
             [0.1, 0.3, 0.5, 0.1],
             [0.25, 0.25, 0.25, 0.25],
             [0.05, 0.8, 0.05, 0.1],
             [0.7, 0.1, 0.1, 0.1],
             [0.6, 0.15, 0.05, 0.2],
         ]
     )
     # Each expected entry equals log2(freq / 0.25), to three decimals.
     log_odds = array(
         [
             [-1.322, 0.263, 1.0, -1.322],
             [0.0, 0.0, 0.0, 0.0],
             [-2.322, 1.678, -2.322, -1.322],
             [1.485, -1.322, -1.322, -1.322],
             [1.263, -0.737, -2.322, -0.322],
         ]
     )
     observed = Profile(freqs, Alphabet="ACTG").toLogOddsMatrix()
     expected = Profile(log_odds, Alphabet="ACTG")
     self.assertFloatEqual(observed.Data, expected.Data, eps=1e-3)
     # the empty fixture converts to an empty matrix
     empty_result = self.empty.toLogOddsMatrix()
     self.assertEqual(empty_result.Data.tolist(), [[]])
예제 #2
0
 def test__add_(self):
     """__add__: should not normalize input or output, just add"""
     left = Profile(array([[.3, .4, .1, 0], [.1, .1, .1, .7]]), Alphabet="ABCD")
     right = Profile(array([[1, 0, 0, 0], [1, 0, 0, 1]]), Alphabet="ABCD")
     total = left + right
     self.assertEqual(total.Data, array([[1.3, .4, .1, 0], [1.1, .1, .1, 1.7]]))
     # adding a populated profile to the empty fixture is rejected
     self.assertRaises(ProfileError, self.empty.__add__, left)
     # two empty profiles add up to an empty profile
     empty_total = self.empty + self.empty
     self.assertEqual(empty_total.Data.tolist(), [[]])
예제 #3
0
    def test_randomSequence(self):
        """randomSequence: 99% of new frequencies should be within 3*SD

        Draws n sequences from a normalized random profile, tallies how often
        each character was drawn at each position, and checks that at most 1%
        of the tallies deviate from n*p by more than three standard deviations.
        """
        r_num, c_num = 100, 20
        num_elements = r_num * c_num
        alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        r = random([r_num, c_num])
        p = Profile(r, alpha[:c_num])
        p.normalizePositions()
        d = p.Data
        n = 1000

        # Test only works on normalized profile, b/c of 1-d below
        means = n * d
        three_stds = sqrt(d * (1 - d) * n) * 3

        a = Alignment([p.randomSequence() for _ in range(n)])

        def absoluteProfile(alignment, char_order):
            """Return per-position counts of each character in char_order."""
            # Tally from the alignment argument, not from the enclosing
            # scope, so the helper only depends on what it is given.
            f = alignment.columnFreqs()
            res = zeros([len(f), len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    col = char_order.index(i)
                    res[row, col] = freq[i]
            return res

        ap = absoluteProfile(a, p.CharOrder)
        failure = abs(ap - means) > three_stds
        assert sum(sum(failure)) / num_elements <= 0.01
예제 #4
0
    def test_toOddsMatrix(self):
        """toOddsMatrix: should work on valid data or raise an error"""
        uniform = [0.25, 0.25, 0.25, 0.25]
        profile = Profile(
            array(
                [
                    [0.1, 0.3, 0.5, 0.1],
                    [0.25, 0.25, 0.25, 0.25],
                    [0.05, 0.8, 0.05, 0.1],
                    [0.7, 0.1, 0.1, 0.1],
                    [0.6, 0.15, 0.05, 0.2],
                ]
            ),
            Alphabet="ACTG",
        )
        expected = Profile(
            array(
                [
                    [0.4, 1.2, 2, 0.4],
                    [1, 1, 1, 1],
                    [0.2, 3.2, 0.2, 0.4],
                    [2.8, 0.4, 0.4, 0.4],
                    [2.4, 0.6, 0.2, 0.8],
                ]
            ),
            Alphabet="ACTG",
        )
        # no argument and an explicit uniform background give the same odds
        self.assertEqual(profile.toOddsMatrix().Data, expected.Data)
        assert profile.Alphabet is profile.toOddsMatrix().Alphabet
        self.assertEqual(profile.toOddsMatrix(uniform).Data, expected.Data)

        # fails if symbol_freqs has wrong size
        too_long = [0.25] * 6
        too_short = [0.1, 0.2, 0.3]
        self.assertRaises(ProfileError, profile.toOddsMatrix, too_long)
        self.assertRaises(ProfileError, self.zero_entry.toOddsMatrix, too_short)
        # works on empty profile
        self.assertEqual(self.empty.toOddsMatrix().Data.tolist(), [[]])
        # works with different input
        default_odds = self.zero_entry.toOddsMatrix().Data
        self.assertEqual(default_odds, array([[1.2, 0.8, 0, 2], [0, 0, 3.2, 0.8]]))
        skewed_odds = self.zero_entry.toOddsMatrix([0.1, 0.2, 0.3, 0.4]).Data
        self.assertFloatEqual(
            skewed_odds, array([[3, 1, 0, 1.25], [0, 0, 2.667, 0.5]]), 1e-3
        )
        # fails when one of the background frequencies is 0
        with_zero = [0.1, 0.2, 0.3, 0]
        self.assertRaises(ProfileError, self.zero_entry.toOddsMatrix, with_zero)
예제 #5
0
    def test_randomIndices(self):
        """randomIndices: 99% of new frequencies should be within 3*SD

        Draws n index vectors from a normalized random profile, tallies how
        often each index was drawn at each position, and checks that at most
        1% of the tallies deviate from n*p by more than three standard
        deviations.
        """
        r_num, c_num = 100, 20
        num_elements = r_num * c_num
        r = random([r_num, c_num])
        p = Profile(r, "A" * c_num)
        p.normalizePositions()
        d = p.Data
        n = 1000

        # Test only works on normalized profile, b/c of 1-d below
        means = n * d
        three_stds = sqrt(d * (1 - d) * n) * 3
        result = [p.randomIndices() for _ in range(n)]
        a = Alignment(transpose(result))

        def absoluteProfile(alignment, char_order):
            """Return per-position counts, one column per entry of char_order."""
            # Tally from the alignment argument, not from the enclosing
            # scope, so the helper only depends on what it is given.
            f = alignment.columnFreqs()
            res = zeros([len(f), len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    # each key is mapped to its column through ord()
                    res[row, ord(i)] = freq[i]
            return res

        ap = absoluteProfile(a, p.CharOrder)
        failure = abs(ap - means) > three_stds
        assert sum(sum(failure)) / num_elements <= 0.01
예제 #6
0
    def test_score_profile(self):
        """score: should work correctly for Profile as input"""
        five_rows = Profile(
            array([
                [1, 0, 0, 0],
                [0, 1, 0, 0],
                [0, 0, .5, .5],
                [0, 0, 0, 1],
                [.25, .25, .25, .25],
            ]),
            "TCAG",
        )
        five_rows_mixed = Profile(
            array([
                [0, 1, 0, 0],
                [.2, 0, .8, 0],
                [0, 0, .5, .5],
                [1 / 3, 1 / 3, 0, 1 / 3],
                [.25, .25, .25, .25],
            ]),
            "TCAG",
        )
        three_rows = Profile(array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1]]), "TCAG")
        two_rows = Profile(array([[1, 0, 0, 0], [0, 1, 0, 0]]), "TCAG")
        other_order = Profile(array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1]]), "AGTC")
        scorer = self.score2

        # works on normal valid data
        self.assertFloatEqual(scorer.score(five_rows, offset=0), [.55, 1.25, .45])
        self.assertFloatEqual(
            scorer.score(five_rows_mixed, offset=0), [1.49, 1.043, .483], 1e-3
        )
        # works with different offset
        self.assertFloatEqual(scorer.score(five_rows, offset=1), [1.25, 0.45])
        self.assertFloatEqual(scorer.score(five_rows, offset=2), [0.45])
        # raises error on invalid offset
        self.assertRaises(ProfileError, scorer.score, five_rows, offset=3)
        # works on profile of minimal length
        self.assertFloatEqual(scorer.score(three_rows, offset=0), [0.6])
        # raises error when profile is too short
        self.assertRaises(ProfileError, scorer.score, two_rows, offset=0)
        # raises error on empty profile
        self.assertRaises(ProfileError, self.empty.score, five_rows)
        # raises error when character order doesn't match
        self.assertRaises(ProfileError, scorer.score, other_order)
예제 #7
0
 def test__sub_(self):
     """__sub__: should subtract two profiles, no normalization"""
     minuend = Profile(array([[.3, .4, .1, 0], [.1, .1, .1, .7]]), Alphabet="ABCD")
     subtrahend = Profile(array([[1, 0, 0, 0], [1, 0, 0, 1]]), Alphabet="ABCD")
     difference = minuend - subtrahend
     # plain element-wise difference of the raw data
     expected = array([[-.7, .4, .1, 0], [-.9, .1, .1, -.3]])
     self.assertFloatEqual(difference.Data, expected)
예제 #8
0
    def test_randomSequence(self):
        """randomSequence: 99% of new frequencies should be within 3*SD

        Draws n sequences from a normalized random profile, tallies how often
        each character was drawn at each position, and checks that at most 1%
        of the tallies deviate from n*p by more than three standard deviations.
        """
        r_num, c_num = 100, 20
        num_elements = r_num * c_num
        alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        r = random([r_num, c_num])
        p = Profile(r, alpha[:c_num])
        p.normalizePositions()
        d = p.Data
        n = 1000

        # Test only works on normalized profile, b/c of 1-d below
        means = n * d
        three_stds = sqrt(d * (1 - d) * n) * 3

        a = Alignment([p.randomSequence() for _ in range(n)])

        def absoluteProfile(alignment, char_order):
            """Return per-position counts of each character in char_order."""
            # Tally from the alignment argument, not from the enclosing
            # scope, so the helper only depends on what it is given.
            f = alignment.columnFreqs()
            res = zeros([len(f), len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    col = char_order.index(i)
                    res[row, col] = freq[i]
            return res

        ap = absoluteProfile(a, p.CharOrder)
        failure = abs(ap - means) > three_stds
        assert sum(sum(failure)) / num_elements <= 0.01
예제 #9
0
    def test_randomIndices(self):
        """randomIndices: 99% of new frequencies should be within 3*SD

        Draws n index vectors from a normalized random profile, tallies how
        often each index was drawn at each position, and checks that at most
        1% of the tallies deviate from n*p by more than three standard
        deviations.
        """
        r_num, c_num = 100, 20
        num_elements = r_num * c_num
        r = random([r_num, c_num])
        p = Profile(r, "A" * c_num)
        p.normalizePositions()
        d = p.Data
        n = 1000

        # Test only works on normalized profile, b/c of 1-d below
        means = n * d
        three_stds = sqrt(d * (1 - d) * n) * 3
        result = [p.randomIndices() for _ in range(n)]
        a = Alignment(transpose(result))

        def absoluteProfile(alignment, char_order):
            """Return per-position counts, one column per entry of char_order."""
            # Tally from the alignment argument, not from the enclosing
            # scope, so the helper only depends on what it is given.
            f = alignment.columnFreqs()
            res = zeros([len(f), len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    # each key is mapped to its column through ord()
                    res[row, ord(i)] = freq[i]
            return res

        ap = absoluteProfile(a, p.CharOrder)
        failure = abs(ap - means) > three_stds
        assert sum(sum(failure)) / num_elements <= 0.01
예제 #10
0
 def test_dataAt(self):
     """dataAt: should work on valid position and character"""
     rows = [[0.2, 0.4, 0.4, 0], [0.1, 0, 0.9, 0], [0.1, 0.2, 0.3, 0.4]]
     profile = Profile(array(rows), Alphabet="TCAG")
     # valid (position, character) look-ups
     self.assertEqual(profile.dataAt(0, "C"), 0.4)
     self.assertEqual(profile.dataAt(1, "T"), 0.1)
     # unknown character, negative position, position past the last row
     for bad_pos, bad_char in ((1, "U"), (-2, "T"), (5, "T")):
         self.assertRaises(ProfileError, profile.dataAt, bad_pos, bad_char)
예제 #11
0
 def test_dataAt(self):
     """dataAt: should work on valid position and character"""
     data = array([[.2, .4, .4, 0], [.1, 0, .9, 0], [.1, .2, .3, .4]])
     prof = Profile(data, Alphabet="TCAG")
     # look-ups that must succeed
     self.assertEqual(prof.dataAt(0, 'C'), .4)
     self.assertEqual(prof.dataAt(1, 'T'), .1)
     # character that is not in the alphabet
     self.assertRaises(ProfileError, prof.dataAt, 1, 'U')
     # negative position and position past the last row
     self.assertRaises(ProfileError, prof.dataAt, -2, 'T')
     self.assertRaises(ProfileError, prof.dataAt, 5, 'T')
예제 #12
0
 def test_score_no_trans_table(self):
     """score: should work when no translation table is present"""
     weights = array([[-1, 0, 1, 2], [-2, 2, 0, 0], [-3, 5, 1, 0]])
     profile = Profile(Data=weights, Alphabet=DNA, CharOrder="ATGC")
     # drop the translation table from the instance dict
     del profile.__dict__["_translation_table"]
     # scoring a sequence must still succeed without it
     query = DNA.Sequence("ATTCAC")
     observed = profile.score(query, offset=0)
     self.assertEqual(observed, [6, 2, -3, 0])
예제 #13
0
 def test_score_no_trans_table(self):
     """score: should work when no translation table is present"""
     prof = Profile(
         Data=array([[-1, 0, 1, 2], [-2, 2, 0, 0], [-3, 5, 1, 0]]),
         Alphabet=DNA,
         CharOrder="ATGC",
     )
     # remove the translation table stored on the instance
     del prof.__dict__['_translation_table']
     # the profile must still be able to score a sequence
     scores = prof.score(DNA.Sequence("ATTCAC"), offset=0)
     self.assertEqual(scores, [6, 2, -3, 0])
예제 #14
0
def mVOR(alignment, n=1000, order=DNA_ORDER):
    """Return sequence weights according to the modified Voronoi method.

    alignment: Alignment object
    n: sample size (number of random profiles to generate)
    order: the order of the characters found in the alignment; used to
        build both the per-sequence profiles and the random profiles.

    mVOR modifies the VOR method: instead of discrete random sequences it
    generates random profiles, to sample more evenly from sequence space
    and to avoid random sequences that are equidistant to several
    sequences in the alignment. Such ties can be a problem in small
    datasets; for longer sequences their likelihood is negligible.

    Each random profile is an array with one row per alignment position
    and one column per character in order, filled with draws from the
    standard exponential distribution (scale 1, hence mean 1) and then
    normalized with normalizePositions(). It is compared with the profile
    of every single sequence through Profile.distance (described by the
    original author as the Euclidean distance). The distances are turned
    into votes by row_to_vote, the votes are summed over all n samples,
    and the totals are returned as a normalized Weights object keyed by
    sequence name.
    """
    weights = zeros(len(alignment.Names), Float64)

    # one single-sequence profile per aligned sequence, keyed like the alignment
    seq_profiles = {
        name: SeqToProfile(seq, alphabet=order)
        for name, seq in alignment.items()
    }

    for _ in range(n):
        # draw a fresh random profile and normalize it
        random_profile = Profile(
            Data=exponential(1, [alignment.SeqLen, len(order)]),
            Alphabet=order,
        )
        random_profile.normalizePositions()
        # distance from every sequence profile to the random profile,
        # in the order given by alignment.Names
        distances = array(
            [seq_profiles[name].distance(random_profile)
             for name in alignment.Names]
        )
        weights += row_to_vote(distances)

    result = Weights(dict(zip(alignment.Names, weights)))
    result.normalize()
    return result
예제 #15
0
def mVOR(alignment,n=1000,order=DNA_ORDER):
    """Returns sequence weights according to the modified Voronoi method.

    alignment: Alignment object
    n: sample size (=number of random profiles to be generated)
    order: specifies the order of the characters found in the alignment,
        used to build the sequence and random profiles.

    mVOR is a modification of the VOR method. Instead of generating discrete
    random sequences, it generates random profiles, to sample more equally
    from the sequence space and to prevent random sequences from being
    equidistant to multiple sequences in the alignment.

    Random generalized sequences (profiles filled with random numbers):
    Sequences that are equidistant to multiple sequences in the alignment
    can form a problem in small datasets. For longer sequences the
    likelihood of this event is negligible. Generating 'random generalized
    sequences' is a solution, because we're then sampling from continuous
    sequence space.

    A random profile is a two-dimensional array with one row per position
    in the alignment and one column per character in order, filled with
    random numbers sampled from the standard exponential distribution
    (scale 1, and thus mean 1) and then normalized with
    normalizePositions(). These random profiles are compared to the
    profile of each single sequence; the original author describes the
    distance between two profiles as the Euclidean distance.

    Returns a normalized Weights object keyed by sequence name.
    """
    # These do not change between samples, so look them up once.
    names = alignment.Names
    shape = [alignment.SeqLen, len(order)]
    weights = zeros(len(names), Float64)

    # get seq profiles
    seq_profiles = {}
    for k, v in alignment.items():
        seq_profiles[k] = SeqToProfile(v, alphabet=order)

    for _ in range(n):
        # generate a random profile
        r = Profile(Data=exponential(1, shape), Alphabet=order)
        r.normalizePositions()
        # distance between the random profile and every sequence profile,
        # in the order of the alignment's names
        temp = [seq_profiles[key].distance(r) for key in names]
        votes = row_to_vote(array(temp))
        weights += votes
    weight_dict = Weights(dict(zip(names, weights)))
    weight_dict.normalize()
    return weight_dict
예제 #16
0
 def test_columnUncertainty(self):
     """columnUncertainty: should handle full and empty profiles"""
     populated = Profile(
         array([[0.25, 0.5], [0.25, 0.5], [0.25, 0], [0.25, 0]]), "AB"
     )
     self.assertEqual(populated.columnUncertainty(), [2, 1])
     # profiles without columns yield an empty result
     self.assertEqual(self.empty.columnUncertainty().tolist(), [])
     no_columns = Profile(array([[], [], []]), "")
     self.assertEqual(no_columns.columnUncertainty().tolist(), [])
     # a 1D data array is rejected
     self.assertRaises(ProfileError, self.oned.columnUncertainty)
예제 #17
0
    def test_reduce_normalization_error(self):
        """reduce: fails when input or output can't be normalized"""
        # input data that cannot be normalized is rejected
        unnormalizable = (
            (self.empty, self.empty),
            (self.full, self.empty_row),
        )
        for profile, other in unnormalizable:
            self.assertRaises(ProfileError, profile.reduce, other, add)

        # inputs left as they are, output normalized:
        # fails because the second row of the sum adds up to zero
        positive = Profile(array([[3, 3], [4, 4]]), "AB")
        cancelling = Profile(array([[3, 3], [-4, -4]]), "AB")
        self.assertRaises(
            ProfileError, positive.reduce, cancelling, add, False, True
        )
예제 #18
0
    def test__div_(self):
        """__div__ and __truediv__: always true division b/c __future__.division
        """
        dividend = Profile(array([[2, 3], [4, 5]]), "AB")
        int_zero = Profile(array([[1, 0], [4, 5]]), "AB")  # contains an int 0
        float_zero = Profile(array([[1, 0.0], [4, 5]]), "AB")  # contains a float 0.0
        no_zero = Profile(array([[1, 2], [8.0, 5]]), "AB")  # float data, no zeros

        # dividing by a profile that contains an integer zero
        self.assertRaises(ProfileError, dividend.__truediv__, int_zero)
        # infinity in result data
        self.assertRaises(ProfileError, dividend.__div__, float_zero)
        quotient = dividend.__div__(no_zero)
        self.assertFloatEqual(quotient.Data, array([[2, 1.5], [0.5, 1]]))
예제 #19
0
    def test_rowUncertainty(self):
        """rowUncertainty: should handle full and empty profiles"""
        populated = Profile(
            array([[0.25, 0.25, 0.25, 0.25], [0.5, 0.5, 0, 0]]), "ABCD"
        )
        self.assertEqual(populated.rowUncertainty(), [2, 1])

        # profiles without any data yield an empty result
        self.assertEqual(self.empty.rowUncertainty().tolist(), [])
        three_empty_rows = Profile(array([[], [], []]), "")
        self.assertEqual(three_empty_rows.rowUncertainty().tolist(), [])
        # a 1D data array is rejected
        self.assertRaises(ProfileError, self.oned.rowUncertainty)
예제 #20
0
    def test_copy(self):
        """copy: should act as expected while rebinding/modifying attributes

        The copy shares Data, Alphabet and CharOrder with the original:
        in-place changes are visible through both objects, while rebinding
        an attribute only affects the object it is rebound on.
        """
        p = Profile(array([[1, 1], [.7, .3]]), {
            'A': 'A',
            'G': 'G',
            'R': 'AG'
        }, "AG")
        p_copy = p.copy()
        assert p.Data is p_copy.Data
        assert p.Alphabet is p_copy.Alphabet
        assert p.CharOrder is p_copy.CharOrder

        # modifying p.Data in place modifies p_copy.Data: same array object
        p.Data[1, 1] = 100
        assert p.Data is p_copy.Data
        assert p_copy.Data[1, 1] == 100
        assert p.Alphabet is p_copy.Alphabet

        # normalizing p.Data rebinds it, so p_copy.Data is unchanged
        p.normalizePositions()
        assert p.Data is not p_copy.Data

        # Adding something to the alphabet changes both p and p_copy
        p.Alphabet['Y'] = 'TC'
        assert p.Alphabet is p_copy.Alphabet

        # Rebinding the CharOrder only changes the original
        p.CharOrder = 'XX'
        assert p.CharOrder is not p_copy.CharOrder
예제 #21
0
 def test_hasValidAttributes(self):
     """hasValidAttributes: should work for different alphabets/char orders"""
     cases = (
         ("BAC", False),  # len(CharOrder) doesn't match the data
         ("AX", False),   # not all chars in CharOrder are in Alphabet
         ("CB", True),    # should be fine
     )
     for char_order, expected in cases:
         profile = Profile(
             array([[1, 2], [3, 4]]), Alphabet="ABCD", CharOrder=char_order
         )
         self.assertEqual(profile.hasValidAttributes(), expected)
예제 #22
0
    def test_isValid(self):
        """isValid: should work as expected"""
        fraction_rows = [[.3, .7], [.8, .2]]
        # everything valid
        all_valid = Profile(array(fraction_rows), Alphabet="AB", CharOrder="AB")
        # invalid data, valid attributes
        bad_data = Profile(array([[1, 2], [3, 4]]), Alphabet="ABCD", CharOrder="BA")
        # invalid attributes, valid data
        bad_attrs = Profile(array(fraction_rows), Alphabet="ABCD", CharOrder="AF")

        for profile, expected in (
            (all_valid, True),
            (bad_data, False),
            (bad_attrs, False),
        ):
            self.assertEqual(profile.isValid(), expected)
예제 #23
0
    def test_toConsensus(self):
        """toConsensus: should work with all the different options"""
        consensus_cases = (
            ({"fully_degenerate": False}, "AGGAT"),
            ({"fully_degenerate": True}, "WVGNY"),
            ({"cutoff": 0.75}, "ARGHY"),
            ({"cutoff": 0.95}, "WVGNY"),
            ({"cutoff": 2}, "WVGNY"),
        )
        for options, expected in consensus_cases:
            self.assertEqual(self.consensus.toConsensus(**options), expected)

        p = self.not_same_value
        not_same_value_cases = (
            ({"fully_degenerate": False}, "CGTA"),
            ({"fully_degenerate": True}, "NBYA"),
            ({"cutoff": 0.75}, "YSYA"),
            ({"cutoff": 2}, "NBYA"),
            ({"cutoff": 5}, "NBYA"),
            # when both fully_degenerate and a cutoff value are given,
            # the cutoff takes priority and is used in the calculation
            ({"cutoff": 0.75, "fully_degenerate": True}, "YSYA"),
        )
        for options, expected in not_same_value_cases:
            self.assertEqual(p.toConsensus(**options), expected)

        # raises AttributeError when Alphabet doesn't have Degenerates
        plain = Profile(array([[.2, .8], [.7, .3]]), "AB")
        self.assertRaises(AttributeError, plain.toConsensus, cutoff=.5)
예제 #24
0
    def test_pos_char_weights(self):
        """pos_char_weights: should return correct contributions at each pos
        """
        # expected weight of every observed character, one dict per position
        per_position = [
            {'G': 1 / 4},
            {'Y': 1 / 6, 'F': 1 / 2},
            {'V': 1 / 3, 'D': 1 / 6, 'Q': 1 / 3},
            {'G': 1 / 4},
            {'G': 1 / 3, 'F': 1 / 6, 'S': 1 / 3},
        ]
        # rows follow PROTEIN_ORDER, columns are alignment positions
        expected = zeros([len(PROTEIN_ORDER), self.aln2.SeqLen], Float64)
        for column, char_weights in enumerate(per_position):
            for char, weight in char_weights.items():
                expected[PROTEIN_ORDER.index(char), column] = weight
        expected_profile = Profile(expected, Alphabet=PROTEIN_ORDER)

        # check observed against expected
        observed = pos_char_weights(self.aln2, PROTEIN_ORDER)
        self.assertEqual(observed.Data, expected_profile.Data)
예제 #25
0
def pos_char_weights(alignment, order=DNA_ORDER):
    """Returns the contribution of each character at each position.

    alignment: Alignment object
    order: the order of characters in the profile (must contain all
        characters observed in the alignment)

    This function is used by the function position_based.

    The weight of a character in a column is
        1 / (number of distinct characters in the column
             * number of times the character occurs in the column)

    For example:
    GYVGS
    GFDGF
    GYDGF
    GYQGG

        0       1       2       3       4
    G   1/1*4                   1/1*4   1/3*1
    Y           1/2*3
    F           1/2*1                   1/3*2
    V                   1/3*1
    D                   1/3*2
    Q                   1/3*1
    S                                   1/3*1

    Returns a Profile whose data has one row per character in order and
    one column per alignment position; characters not observed in a
    column keep weight 0. Raises ValueError (from order.index) when the
    alignment contains a character that is not in order.
    """
    counts = alignment.columnFreqs()
    a = zeros([len(order), alignment.SeqLen], Float64)
    for col, c in enumerate(counts):
        # number of distinct characters observed in this column
        distinct = len(c)
        for char in c:
            # 1.0 keeps this a float division even where '/' on two
            # integers would truncate to 0
            a[order.index(char), col] = 1.0 / (distinct * c[char])
    return Profile(a, Alphabet=order)
예제 #26
0
 def test_toLogOddsMatrix(self):
     """toLogOddsMatrix: should work as expected"""
     # Kept short on purpose: the method mainly depends on toOddsMatrix,
     # for which everything has been tested.
     source = Profile(
         array([
             [.1, .3, .5, .1],
             [.25, .25, .25, .25],
             [.05, .8, .05, .1],
             [.7, .1, .1, .1],
             [.6, .15, .05, .2],
         ]),
         Alphabet="ACTG",
     )
     # Each expected entry equals log2(freq / 0.25), to three decimals.
     expected = Profile(
         array([
             [-1.322, 0.263, 1., -1.322],
             [0., 0., 0., 0.],
             [-2.322, 1.678, -2.322, -1.322],
             [1.485, -1.322, -1.322, -1.322],
             [1.263, -0.737, -2.322, -0.322],
         ]),
         Alphabet="ACTG",
     )
     log_odds = source.toLogOddsMatrix()
     self.assertFloatEqual(log_odds.Data, expected.Data, eps=1e-3)
     # works on empty matrix
     empty_log_odds = self.empty.toLogOddsMatrix()
     self.assertEqual(empty_log_odds.Data.tolist(), [[]])
예제 #27
0
    def test_reduce_operators(self):
        """reduce: should work fine with different operators"""
        # different operators, normalize input, don't normalize output
        first = Profile(array([[1, 0, 0], [0, 1, 0]]), Alphabet="ABC")
        second = Profile(array([[1, 0, 0], [0, 0, 1]]), Alphabet="ABC")
        input_only = {"normalize_input": True, "normalize_output": False}

        # with no extra arguments
        self.assertEqual(
            first.reduce(second).Data, array([[1, 0, 0], [0, 0.5, 0.5]])
        )
        for op, expected in (
            (add, [[2, 0, 0], [0, 1, 1]]),
            (subtract, [[0, 0, 0], [0, 1, -1]]),
            (multiply, [[1, 0, 0], [0, 0, 0]]),
        ):
            reduced = first.reduce(second, op, **input_only)
            self.assertEqual(reduced.Data, array(expected))

        # division by the zero entries is rejected
        self.assertRaises(ProfileError, first.reduce, second, divide, **input_only)

        # don't normalize and normalize only input
        ascending = Profile(array([[1, 2], [3, 4]]), Alphabet="AB")
        descending = Profile(array([[4, 3], [2, 1]]), Alphabet="AB")

        raw_sum = ascending.reduce(
            descending, add, normalize_input=False, normalize_output=False
        )
        self.assertEqual(raw_sum.Data, array([[5, 5], [5, 5]]))
        normalized_input_sum = ascending.reduce(descending, add, **input_only)
        self.assertFloatEqual(
            normalized_input_sum.Data,
            array([[19 / 21, 23 / 21], [23 / 21, 19 / 21]]),
        )

        # normalize input and output
        wide_a = Profile(array([[1, 1, 0, 0], [1, 1, 1, 1]]), Alphabet="ABCD")
        wide_b = Profile(array([[1, 0, 0, 0], [1, 0, 0, 1]]), Alphabet="ABCD")

        fully_normalized = wide_a.reduce(
            wide_b, add, normalize_input=True, normalize_output=True
        )
        self.assertEqual(
            fully_normalized.Data,
            array([[0.75, 0.25, 0, 0], [0.375, 0.125, 0.125, 0.375]]),
        )

        # it can collapse empty profiles when normalizing is turned off
        collapsed = self.empty.reduce(
            self.empty, normalize_input=False, normalize_output=False
        )
        self.assertEqual(collapsed.Data.tolist(), [[]])
예제 #28
0
    def test__score_profile(self):
        """_score_profile: should work on valid input"""
        simple = Profile(
            array([
                [1, 0, 0, 0],
                [0, 1, 0, 0],
                [0, 0, .5, .5],
                [0, 0, 0, 1],
                [.25, .25, .25, .25],
            ]),
            "TCAG",
        )
        mixed = Profile(
            array([
                [0, 1, 0, 0],
                [.2, 0, .8, 0],
                [0, 0, .5, .5],
                [1 / 3, 1 / 3, 0, 1 / 3],
                [.25, .25, .25, .25],
            ]),
            "TCAG",
        )
        scorer = self.score2

        self.assertFloatEqual(scorer._score_profile(simple, offset=0), [.55, 1.25, .45])
        self.assertFloatEqual(scorer._score_profile(simple, offset=2), [.45])
        self.assertFloatEqual(
            scorer._score_profile(mixed, offset=0), [1.49, 1.043, .483], 1e-3
        )

        # This method does no input validation (that is done elsewhere),
        # so an offset that is too large raises no error here: it just
        # gives an unexpected, empty result.
        out_of_range = scorer._score_profile(simple, offset=3)
        self.assertFloatEqual(out_of_range.tolist(), [])
예제 #29
0
    def test_normalizeSequences(self):
        """normalizeSequences: should normalize or raise appropriate error"""
        # every column of the full fixture is scaled to sum to one
        full = self.full.copy()
        full.normalizeSequences()
        expected_full = array([[2 / 9, 4 / 17], [3 / 9, 5 / 17], [4 / 9, 8 / 17]])
        self.assertEqual(full.Data, expected_full)
        self.assertEqual(sum(full.Data, axis=0), [1, 1])

        # an all-zero row is fine ...
        with_empty_row = self.empty_row.copy()
        with_empty_row.normalizeSequences()
        self.assertEqual(with_empty_row.Data, array([[1, 1], [0, 0]]))
        # ... but the empty_col fixture and an all-zero profile are not
        with_empty_col = self.empty_col.copy()
        self.assertRaises(ProfileError, with_empty_col.normalizeSequences)
        all_zero = Profile(array([[0.0], [0.0]]), "AB")
        self.assertRaises(ProfileError, all_zero.normalizeSequences)

        # negative numbers: columns summing to one stay unchanged,
        # a column summing to zero raises
        sums_to_one = Profile(array([[3, 4], [-2, -3]]), "AB")
        sums_to_one.normalizeSequences()
        self.assertEqual(sums_to_one.Data, array([[3, 4], [-2, -3]]))
        sums_to_zero = Profile(array([[3, 4], [-3, -3]]), "AB")
        self.assertRaises(ProfileError, sums_to_zero.normalizeSequences)
예제 #30
0
    def test_normalizePositions(self):
        """normalizePositions: should normalize or raise appropriate error"""
        # every row of the full fixture is scaled to sum to one
        full = self.full.copy()
        full.normalizePositions()
        expected_full = array([[2 / 6, 4 / 6], [3 / 8, 5 / 8], [4 / 12, 8 / 12]])
        self.assertEqual(full.Data, expected_full)
        self.assertEqual(sum(full.Data, 1), [1, 1, 1])

        # an all-zero column is fine ...
        with_empty_col = self.empty_col.copy()
        with_empty_col.normalizePositions()
        self.assertEqual(with_empty_col.Data, array([[0, 1], [0, 1]]))
        # ... but the empty_row fixture and an all-zero profile are not
        with_empty_row = self.empty_row.copy()
        self.assertRaises(ProfileError, with_empty_row.normalizePositions)
        all_zero = Profile(array([[0.0, 0.0]]), "AB")
        self.assertRaises(ProfileError, all_zero.normalizePositions)

        # negative numbers: rows summing to one stay unchanged,
        # a row summing to zero raises
        sums_to_one = Profile(array([[3, -2], [4, -3]]), "AB")
        sums_to_one.normalizePositions()
        self.assertEqual(sums_to_one.Data, array([[3, -2], [4, -3]]))
        sums_to_zero = Profile(array([[3, -3], [4, -3]]), "AB")
        self.assertRaises(ProfileError, sums_to_zero.normalizePositions)
예제 #31
0
 def test_toOddsMatrix(self):
     """toOddsMatrix: should work on valid data or raise an error"""
     source = Profile(
         array([
             [.1, .3, .5, .1],
             [.25, .25, .25, .25],
             [.05, .8, .05, .1],
             [.7, .1, .1, .1],
             [.6, .15, .05, .2],
         ]),
         Alphabet="ACTG",
     )
     expected = Profile(
         array([
             [.4, 1.2, 2, .4],
             [1, 1, 1, 1],
             [.2, 3.2, .2, .4],
             [2.8, .4, .4, .4],
             [2.4, .6, .2, .8],
         ]),
         Alphabet="ACTG",
     )
     # no argument and an explicit uniform background give the same odds
     self.assertEqual(source.toOddsMatrix().Data, expected.Data)
     assert source.Alphabet is source.toOddsMatrix().Alphabet
     uniform_odds = source.toOddsMatrix([.25, .25, .25, .25])
     self.assertEqual(uniform_odds.Data, expected.Data)

     # fails if symbol_freqs has wrong size
     six_freqs = [.25, .25, .25, .25, .25, .25]
     three_freqs = [.1, .2, .3]
     self.assertRaises(ProfileError, source.toOddsMatrix, six_freqs)
     self.assertRaises(ProfileError, self.zero_entry.toOddsMatrix, three_freqs)
     # works on empty profile
     self.assertEqual(self.empty.toOddsMatrix().Data.tolist(), [[]])
     # works with different input
     zero_entry_odds = self.zero_entry.toOddsMatrix()
     self.assertEqual(
         zero_entry_odds.Data, array([[1.2, .8, 0, 2], [0, 0, 3.2, .8]])
     )
     skewed_odds = self.zero_entry.toOddsMatrix([.1, .2, .3, .4])
     self.assertFloatEqual(
         skewed_odds.Data, array([[3, 1, 0, 1.25], [0, 0, 2.667, .5]]), 1e-3
     )
     # fails when one of the background frequencies is 0
     self.assertRaises(
         ProfileError, self.zero_entry.toOddsMatrix, [.1, .2, .3, 0]
     )
예제 #32
0
 def test_init(self):
     """__init__: should set all attributes correctly"""
     # neither zero arguments nor data alone is accepted
     self.assertRaises(TypeError, Profile)
     self.assertRaises(TypeError, Profile, array([[2, 3]]))

     # only alphabet
     by_alphabet = Profile(array([[.2, .8], [.7, .3]]), "AB")
     self.assertEqual(by_alphabet.Data, [[.2, .8], [.7, .3]])
     self.assertEqual(by_alphabet.Alphabet, "AB")
     self.assertEqual(by_alphabet.CharOrder, list("AB"))
     translated = translate("ABBA", by_alphabet._translation_table)
     self.assertEqual(translated, "\x00\x01\x01\x00")

     # alphabet and char order
     with_order = Profile(
         array([[.1, .2], [.4, .3]]), Alphabet=DNA, CharOrder="AG"
     )
     self.assertEqual(with_order.CharOrder, "AG")
     assert with_order.Alphabet is DNA

     # non-character alphabet
     numeric = Profile(
         array([[.1, .2], [.4, .3]]), Alphabet=[7, 3], CharOrder=[3, 7]
     )
     self.assertEqual(numeric.CharOrder, [3, 7])
     self.assertEqual(numeric.Alphabet, [7, 3])
     self.assertEqual(numeric.Data, [[.1, .2], [.4, .3]])
예제 #33
0
 def test_columnUncertainty(self):
     """columnUncertainty: should handle full and empty profiles"""
     data = array([[.25, .5], [.25, .5], [.25, 0], [.25, 0]])
     filled = Profile(data, "AB")
     self.assertEqual(filled.columnUncertainty(), [2, 1])
     # for empty cols nothing is returned as the uncertainty
     empty_result = self.empty.columnUncertainty()
     self.assertEqual(empty_result.tolist(), [])
     no_cols = Profile(array([[], [], []]), "")
     self.assertEqual(no_cols.columnUncertainty().tolist(), [])
     # doesn't work on 1D array
     self.assertRaises(ProfileError, self.oned.columnUncertainty)
예제 #34
0
 def test_rowUncertainty(self):
     """rowUncertainty: should handle full and empty profiles"""
     data = array([[.25, .25, .25, .25], [.5, .5, 0, 0]])
     filled = Profile(data, "ABCD")
     self.assertEqual(filled.rowUncertainty(), [2, 1])

     # profiles without any data give an empty result
     empty_result = self.empty.rowUncertainty()
     self.assertEqual(empty_result.tolist(), [])
     empty_rows = Profile(array([[], [], []]), "")
     self.assertEqual(empty_rows.rowUncertainty().tolist(), [])
     # doesn't work on 1D array
     self.assertRaises(ProfileError, self.oned.rowUncertainty)
예제 #35
0
    def test_isValid(self):
        """isValid: should work as expected"""
        # (profile, expected validity)
        cases = [
            # everything valid
            (Profile(array([[0.3, 0.7], [0.8, 0.2]]), Alphabet="AB", CharOrder="AB"), True),
            # invalid data, valid attributes
            (Profile(array([[1, 2], [3, 4]]), Alphabet="ABCD", CharOrder="BA"), False),
            # invalid attributes, valid data
            (Profile(array([[0.3, 0.7], [0.8, 0.2]]), Alphabet="ABCD", CharOrder="AF"), False),
        ]

        for profile, expected in cases:
            self.assertEqual(profile.isValid(), expected)
예제 #36
0
 def test_hasValidAttributes(self):
     """hasValidAttributes: should work for different alphabets/char orders"""

     def attributes_valid(char_order):
         # fresh 2x2 profile over "ABCD" with the given CharOrder
         profile = Profile(
             array([[1, 2], [3, 4]]), Alphabet="ABCD", CharOrder=char_order
         )
         return profile.hasValidAttributes()

     # self.Data doesn't match len(CharOrder)
     self.assertEqual(attributes_valid("BAC"), False)
     # not all chars in CharOrder in Alphabet
     self.assertEqual(attributes_valid("AX"), False)
     # should be fine
     self.assertEqual(attributes_valid("CB"), True)
예제 #37
0
    def test__div_(self):
        """__div__ and __truediv__: always true division b/c __future__.division
        """
        numerator = Profile(array([[2, 3], [4, 5]]), "AB")
        with_int_zero = Profile(array([[1, 0], [4, 5]]), "AB")  # int 0
        with_float_zero = Profile(array([[1, 0.0], [4, 5]]), "AB")  # float 0.0
        without_zero = Profile(array([[1, 2], [8.0, 5]]), "AB")  # floats, no zero

        # a zero in the divisor is rejected
        self.assertRaises(ProfileError, numerator.__truediv__, with_int_zero)
        # infinity in result data
        self.assertRaises(ProfileError, numerator.__div__, with_float_zero)
        # element-wise true division otherwise
        result = numerator.__div__(without_zero)
        self.assertFloatEqual(result.Data, array([[2, 1.5], [0.5, 1]]))
예제 #38
0
def freqs_from_aln_array(seqs):
    """Returns per-position freqs from arbitrary size alignment.

    seqs = list of lines from aligned fasta file

    Returns a Profile wrapping an integer count matrix with one row per
    alphabet symbol and one column per alignment position, together with
    the alphabet of the parsed sequences.

    Raises ValueError if seqs does not contain any sequence.

    Warning: fails if all seqs aren't the same length.
    written by Rob Knight
    """
    result = None
    for _label, seq in MinimalFastaParser(seqs):
        # Currently cogent does not support . characters for gaps, converting
        # to - characters for compatibility.
        seq = ModelDnaSequence(seq.replace('.', '-'))
        if result is None:
            # The first sequence fixes the shape of the count matrix.
            result = zeros((len(seq.Alphabet), len(seq)), dtype=int)
            indices = arange(len(seq), dtype=int)
        # Add one count at (symbol index, position) for every position.
        result[seq._data, indices] += 1
    if result is None:
        # Without this guard an empty input ends in an UnboundLocalError on
        # `seq` in the return statement below.
        raise ValueError("No sequences found: cannot compute frequencies.")
    return Profile(result, seq.Alphabet)
예제 #39
0
 def test_toConsensus_include_all(self):
     """toConsensus: Should include all possibilities when include_all=True
     """
     first_data = array([
         [.2, 0, .8, 0],
         [0, .1, .2, .7],
         [0, 0, 0, 1],
         [.2, .3, .4, .1],
         [.5, .5, 0, 0],
     ])
     first = Profile(first_data, Alphabet=DNA, CharOrder="TCAG")
     first_consensus = first.toConsensus(cutoff=0.4, include_all=True)
     self.assertEqual(first_consensus, "AGGAY")

     second_data = array([
         [.25, 0.25, .25, 0.25],
         [0.1, .1, .1, 0],
         [.4, 0, .4, 0],
         [0, .2, 0.2, 0.3],
     ])
     second = Profile(second_data, Alphabet=DNA, CharOrder="TCAG")
     second_consensus = second.toConsensus(cutoff=0.4, include_all=True)
     self.assertEqual(second_consensus, "NHWV")
예제 #40
0
    def test_distance(self):
        """distance: should return correct distance between the profiles
        """
        base = Profile(array([[2, 4], [3, 1]]), "AB")
        shifted = Profile(array([[4, 6], [5, 3]]), "AB")
        extra_row = Profile(array([[4, 6], [5, 3], [1, 1]]), "AB")
        flat_two = Profile(array([2, 2]), "AB")
        flat_three = Profile(array([2, 2, 2]), "AB")
        empty = Profile(array([[]]), "AB")

        # the distance is the same in both directions
        self.assertEqual(base.distance(shifted), 4)
        self.assertEqual(shifted.distance(base), 4)
        self.assertEqual(base.distance(flat_two), sqrt(6))
        self.assertEqual(empty.distance(empty), 0)

        # Raises error when frames are not aligned
        for misaligned in (extra_row, flat_three):
            self.assertRaises(ProfileError, base.distance, misaligned)
예제 #41
0
 def setUp(self):
     """setUp method for all Profile tests"""
     # small profiles over the two-letter alphabet "AB"
     self.full = Profile(array([[2, 4], [3, 5], [4, 8]]), "AB")
     self.empty = Profile(array([[]]), "AB")
     self.empty_row = Profile(array([[1, 1], [0, 0]]), "AB")
     self.empty_col = Profile(array([[0, 1], [0, 1]]), "AB")

     # DNA profiles with an explicit character order
     consensus_data = array([
         [.2, 0, .8, 0],
         [0, .1, .2, .7],
         [0, 0, 0, 1],
         [.2, .3, .4, .1],
         [.5, .5, 0, 0],
     ])
     self.consensus = Profile(consensus_data, Alphabet=DNA, CharOrder="TCAG")
     not_same_value_data = array([
         [.3, .5, .1, .1],
         [.4, .6, 0, .7],
         [.3, .2, 0, 0],
         [0, 0, 4, 0],
     ])
     self.not_same_value = Profile(
         not_same_value_data, Alphabet=DNA, CharOrder="TCAG"
     )
     self.zero_entry = Profile(
         array([[.3, .2, 0, .5], [0, 0, .8, .2]]), Alphabet="UCAG"
     )

     # profiles used by the scoring tests
     self.score1 = Profile(
         Data=array([[-1, 0, 1, 2], [-2, 2, 0, 0], [-3, 5, 1, 0]]),
         Alphabet=DNA, CharOrder="ATGC"
     )
     self.score2 = Profile(
         array([[.2, .4, .4, 0], [.1, 0, .9, 0], [.1, .2, .3, .4]]),
         Alphabet="TCAG"
     )

     # a one-dimensional profile and one used by the prettyPrint tests
     self.oned = Profile(array([.25, .25, .25, .25]), "ABCD")
     self.pp = Profile(
         array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]), "ABCD"
     )
예제 #42
0
    def test_copy(self):
        """copy: should act as expected while rebinding/modifying attributes
        """
        p = Profile(array([[1,1],[.7,.3]]),{'A':'A','G':'G','R':'AG'},"AG")
        p_copy = p.copy()
        # a fresh copy shares all three attributes with the original
        assert p.Data is p_copy.Data
        assert p.Alphabet is p_copy.Alphabet
        assert p.CharOrder is p_copy.CharOrder

        # modifying p.Data in place modifies p_copy.Data, because both names
        # still refer to the same array
        p.Data[1,1] = 100
        assert p_copy.Data[1,1] == 100
        assert p.Alphabet is p_copy.Alphabet

        # normalizing p.Data rebinds it, so p_copy.Data is unchanged
        p.normalizePositions()
        assert p.Data is not p_copy.Data

        # adding something to the alphabet changes both p and p_copy
        p.Alphabet['Y']='TC'
        assert p.Alphabet is p_copy.Alphabet

        # rebinding the CharOrder only changes the original
        p.CharOrder='XX'
        assert p.CharOrder is not p_copy.CharOrder
예제 #43
0
    def test_copy(self):
        """copy: should act as expected while rebinding/modifying attributes
        """
        p = Profile(array([[1, 1], [0.7, 0.3]]), {"A": "A", "G": "G", "R": "AG"}, "AG")
        p_copy = p.copy()
        # a fresh copy shares all three attributes with the original
        assert p.Data is p_copy.Data
        assert p.Alphabet is p_copy.Alphabet
        assert p.CharOrder is p_copy.CharOrder

        # modifying p.Data in place modifies p_copy.Data, because both names
        # still refer to the same array
        p.Data[1, 1] = 100
        assert p_copy.Data[1, 1] == 100
        assert p.Alphabet is p_copy.Alphabet

        # normalizing p.Data rebinds it, so p_copy.Data is unchanged
        p.normalizePositions()
        assert p.Data is not p_copy.Data

        # adding something to the alphabet changes both p and p_copy
        p.Alphabet["Y"] = "TC"
        assert p.Alphabet is p_copy.Alphabet

        # rebinding the CharOrder only changes the original
        p.CharOrder = "XX"
        assert p.CharOrder is not p_copy.CharOrder
예제 #44
0
    def test_normalizePositions(self):
        """normalizePositions: should normalize or raise appropriate error
        """
        # every row of the full profile is divided by its own total
        full = self.full.copy()
        full.normalizePositions()
        expected = array([[2/6,4/6],[3/8,5/8],[4/12,8/12]])
        self.assertEqual(full.Data, expected)
        self.assertEqual(sum(full.Data,1),[1,1,1])

        # an all-zero column does not get in the way
        zero_col = self.empty_col.copy()
        zero_col.normalizePositions()
        self.assertEqual(zero_col.Data, array([[0,1],[0,1]]))

        # a row that adds up to zero cannot be normalized
        zero_row = self.empty_row.copy()
        self.assertRaises(ProfileError, zero_row.normalizePositions)
        all_zero = Profile(array([[0.0,0.0]]),"AB")
        self.assertRaises(ProfileError, all_zero.normalizePositions)

        # negative numbers: rows that already add up to 1 stay as they are ...
        sums_to_one = Profile(array([[3,-2],[4,-3]]),"AB")
        sums_to_one.normalizePositions()
        self.assertEqual(sums_to_one.Data, array([[3,-2],[4,-3]]))
        # ... while a row that adds up to 0 raises
        sums_to_zero = Profile(array([[3,-3],[4,-3]]),"AB")
        self.assertRaises(ProfileError, sums_to_zero.normalizePositions)
예제 #45
0
    def test_normalizeSequences(self):
        """normalizeSequences: should normalize or raise appropriate error
        """
        # every column of the full profile is divided by its own total
        full = self.full.copy()
        full.normalizeSequences()
        expected = array([[2/9,4/17],[3/9,5/17],[4/9,8/17]])
        self.assertEqual(full.Data, expected)
        self.assertEqual(sum(full.Data, axis=0),[1,1])

        # an all-zero row does not get in the way
        zero_row = self.empty_row.copy()
        zero_row.normalizeSequences()
        self.assertEqual(zero_row.Data, array([[1,1],[0,0]]))

        # a column that adds up to zero cannot be normalized
        zero_col = self.empty_col.copy()
        self.assertRaises(ProfileError, zero_col.normalizeSequences)
        all_zero = Profile(array([[0.0],[0.0]]),"AB")
        self.assertRaises(ProfileError, all_zero.normalizeSequences)

        # negative numbers: columns that already add up to 1 stay as they are ...
        sums_to_one = Profile(array([[3,4],[-2,-3]]),"AB")
        sums_to_one.normalizeSequences()
        self.assertEqual(sums_to_one.Data, array([[3,4],[-2,-3]]))
        # ... while a column that adds up to 0 raises
        sums_to_zero = Profile(array([[3,4],[-3,-3]]),"AB")
        self.assertRaises(ProfileError, sums_to_zero.normalizeSequences)
예제 #46
0
 def test_make_translation_table(self):
     """_make_translation_table: should return correct table from char order
     """
     profile = Profile(array([[.2, .8], [.7, .3]]), "ABCDE", "AB")
     # "A" and "B" map to their positions (0 and 1) in the char order
     translated = translate("ABBA", profile._translation_table)
     self.assertEqual(translated, "\x00\x01\x01\x00")
예제 #47
0
def AlnToProfile(aln, alphabet=None, char_order=None, split_degenerates=False,
    weights=None):
    """Builds a normalized Profile from an Alignment.

    aln: Alignment object
    alphabet (optional): an Alphabet object (or list of chars; to split
        degenerate symbols the alphabet must have a Degenerates property).
        Defaults to the MolType of the first seq in the alignment.
    char_order (optional): order of the characters in the profile.
        Defaults to list(alphabet).
    split_degenerates (optional): whether the counts for degenerate symbols
        are divided over the non-degenerate symbols they code for.
    weights (optional): dictionary of seq_id: weight. When omitted, every
        seq gets the same weight, 1/len(aln).

    A Profile is a position x character matrix describing which characters
    occur at each position of an alignment. The result is normalized per
    position, so it gives the probability of each character at each position.
    Any error raised while normalizing is re-raised as a ValueError.

    Ignoring chars: leave a character out of the char order to ignore it.
    Ignoring every character at some position raises an error, because that
    position can't be normalized.

    Splitting degenerates: a degenerate character is spread over the
    characters it codes for. For example R = A or G, so an R at a position
    counts for 0.5 A and 0.5 G.

    Example:
    seq1    TCAG    weight: 0.5
    seq2    TAR-    weight: 0.25
    seq3    YAG-    weight: 0.25
    Profile(aln,alphabet=DNA,char_order="TACG",weights=w,
    split_degenerates=True)
    Profile:
       T      A      C      G
    [[ 0.875  0.     0.125  0.   ]
     [ 0.     0.5    0.5    0.   ]
     [ 0.     0.625  0.     0.375]
     [ 0.     0.     0.     1.   ]]
    """
    # Fill in whatever the caller left out.
    if alphabet is None:
        first_seq = list(aln.values())[0]
        alphabet = first_seq.MolType
    if char_order is None:
        char_order = list(alphabet)
    if weights is None:
        weights = dict.fromkeys(aln.keys(), 1/len(aln))

    char_meaning = CharMeaningProfile(alphabet, char_order, split_degenerates)

    # One weighted matrix per sequence: the character codes of the
    # upper-cased sequence pick the rows of char_meaning.Data.
    weighted = [
        char_meaning.Data[array(str(seq).upper(), 'c').view(UInt8)] * weights[name]
        for name, seq in aln.items()
    ]
    summed = reduce(add, weighted)

    result = Profile(summed, alphabet, char_order)
    try:
        result.normalizePositions()
    except Exception as e:
        # Probably one of the rows adds up to zero because every character
        # in the corresponding alignment column is being ignored.
        raise ValueError(e)
    return result
예제 #48
0
#!/usr/bin/env python
# taken from http://pycogent.sourceforge.net/
from cogent.core.profile import Profile
from cogent import LoadSeqs, RNA
# Every print below passes one parenthesised argument, so the script prints
# the same text on Python 2 and is no longer a SyntaxError on Python 3.

# Load the tRNA alignment and report its sizes.
aln = LoadSeqs("data/trna_profile.fasta", moltype=RNA)
print(len(aln.Seqs))
print(len(aln))

# Show the position frequencies before and after normalizing each position.
pf = aln.getPosFreqs()
print(pf.prettyPrint(include_header=True, column_limit=6, col_sep='   '))
pf.normalizePositions()
print(pf.prettyPrint(include_header=True, column_limit=6, col_sep='   '))
print(pf.isValid())

# List the non-zero entries of dataAt(4), one "char: value" pair per line.
print('\n'.join(['%s: %.3f'%(c,f) for (c,f) in zip(pf.CharOrder, pf.dataAt(4)) if f!=0]))

# Consensus sequences under different settings.
print(pf.toConsensus(fully_degenerate=False))
pf.Alphabet=RNA
print("to consensus")
print(pf.toConsensus(fully_degenerate=True))
print(pf.toConsensus(cutoff=0.8))
print(pf.toConsensus(cutoff=0.6))

# Build a profile from rows 54-59 only and score a sequence against it.
loop_profile = Profile(pf.Data[54:60,:], Alphabet=RNA, CharOrder=pf.CharOrder)
print(loop_profile.prettyPrint(include_header=True, column_limit=6, col_sep='   '))
yeast = RNA.Sequence('GCGGAUUUAGCUCAGUU-GGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA')
scores = loop_profile.score(yeast)
print(scores)
print(max(scores))
print(scores.argmax())
예제 #49
0
from cogent.core.profile import Profile
from cogent import LoadSeqs, RNA
# Every print below passes one parenthesised argument, so the script prints
# the same text on Python 2 and is no longer a SyntaxError on Python 3.

# Load the tRNA alignment and report its sizes.
aln = LoadSeqs("data/trna_profile.fasta", moltype=RNA)
print(len(aln.Seqs))
print(len(aln))

# Show the position frequencies before and after normalizing each position.
pf = aln.getPosFreqs()
print(pf.prettyPrint(include_header=True, column_limit=6, col_sep='   '))
pf.normalizePositions()
print(pf.prettyPrint(include_header=True, column_limit=6, col_sep='   '))
print(pf.isValid())

# List the non-zero entries of dataAt(4), one "char: value" pair per line.
print('\n'.join([
    '%s: %.3f' % (c, f) for (c, f) in zip(pf.CharOrder, pf.dataAt(4)) if f != 0
]))

# Consensus sequences under different settings.
print(pf.toConsensus(fully_degenerate=False))
pf.Alphabet = RNA
print("to consensus")
print(pf.toConsensus(fully_degenerate=True))
print(pf.toConsensus(cutoff=0.8))
print(pf.toConsensus(cutoff=0.6))

# Build a profile from rows 54-59 only and score a sequence against it.
loop_profile = Profile(pf.Data[54:60, :], Alphabet=RNA, CharOrder=pf.CharOrder)
print(loop_profile.prettyPrint(include_header=True,
                               column_limit=6,
                               col_sep='   '))
yeast = RNA.Sequence(
    'GCGGAUUUAGCUCAGUU-GGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA'
)
scores = loop_profile.score(yeast)
print(scores)
print(max(scores))
print(scores.argmax())
예제 #50
0
def SeqToProfile(seq, alphabet=None, char_order=None,
    split_degenerates=False):
    """Builds a Profile from a single Sequence object.

    seq: Sequence object
    alphabet (optional): Alphabet object (to split degenerate symbols it
        should have a Degenerates property). Defaults to seq.MolType.
    char_order (optional): the order the characters occur in the Profile.
        Defaults to list(alphabet).
    split_degenerates (optional): whether the counts for degenerate symbols
        are divided over the non-degenerate symbols they code for.

    A Profile is a position x character matrix describing which characters
    occur at each position. A sequence (as opposed to an alignment) has one
    character per position, so a sequence profile generally holds only ones
    and zeros. Degenerate characters can be split, though: an R means a
    50/50% chance of A and G. Characters can also be ignored, which gives
    positions (rows) that contain only zeros.

    Example:
    Sequence = ACGU
    Profile(seq, CharOrder=UCAG):
    U   C   A   G
    0   0   1   0   first pos
    0   1   0   0   second pos
    0   0   0   1   third pos
    1   0   0   0   fourth pos

    Sequence= GURY
    Profile(seq,CharOrder=UCAG, split_degenerates=True)
    U   C   A   G
    0   0   0   1   first pos
    1   0   0   0   second pos
    0   0   .5  .5  third pos
    .5  .5  0   0   fourth pos

    Characters can also be ignored
    Sequence = ACN-
    Profile(seq, CharOrder=UCAGN, split_degenerates=True)
    U   C   A   G
    0   0   1   0   first pos
    0   1   0   0   second pos
    .25 .25 .25 .25 third pos
    0   0   0   0   fourth pos <--contains only zeros
    """
    alphabet = seq.MolType if alphabet is None else alphabet
    char_order = list(alphabet) if char_order is None else char_order

    # What each character means follows from the alphabet, the character
    # order, and whether degenerates are split.
    char_meaning = CharMeaningProfile(alphabet, char_order, split_degenerates)

    # The character codes of the upper-cased sequence pick one row of
    # char_meaning.Data per position.
    char_codes = array(str(seq).upper(), 'c').view(UInt8)
    return Profile(char_meaning.Data[char_codes], alphabet, char_order)
예제 #51
0
class ProfileTests(TestCase):
    """Tests for Profile object"""

    def setUp(self):
        """setUp method for all Profile tests"""
        # small profiles over the two-letter alphabet "AB"
        self.full = Profile(array([[2, 4], [3, 5], [4, 8]]), "AB")
        self.empty = Profile(array([[]]), "AB")
        self.empty_row = Profile(array([[1, 1], [0, 0]]), "AB")
        self.empty_col = Profile(array([[0, 1], [0, 1]]), "AB")

        # DNA profiles with an explicit character order
        consensus_data = array(
            [
                [0.2, 0, 0.8, 0],
                [0, 0.1, 0.2, 0.7],
                [0, 0, 0, 1],
                [0.2, 0.3, 0.4, 0.1],
                [0.5, 0.5, 0, 0],
            ]
        )
        self.consensus = Profile(consensus_data, Alphabet=DNA, CharOrder="TCAG")
        not_same_value_data = array(
            [
                [0.3, 0.5, 0.1, 0.1],
                [0.4, 0.6, 0, 0.7],
                [0.3, 0.2, 0, 0],
                [0, 0, 4, 0],
            ]
        )
        self.not_same_value = Profile(not_same_value_data, Alphabet=DNA, CharOrder="TCAG")
        self.zero_entry = Profile(array([[0.3, 0.2, 0, 0.5], [0, 0, 0.8, 0.2]]), Alphabet="UCAG")

        # profiles used by the scoring tests
        score1_data = array([[-1, 0, 1, 2], [-2, 2, 0, 0], [-3, 5, 1, 0]])
        self.score1 = Profile(Data=score1_data, Alphabet=DNA, CharOrder="ATGC")
        score2_data = array([[0.2, 0.4, 0.4, 0], [0.1, 0, 0.9, 0], [0.1, 0.2, 0.3, 0.4]])
        self.score2 = Profile(score2_data, Alphabet="TCAG")

        # a one-dimensional profile and one used by the prettyPrint tests
        self.oned = Profile(array([0.25, 0.25, 0.25, 0.25]), "ABCD")
        self.pp = Profile(array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]), "ABCD")

    def test_init(self):
        """__init__: should set all attributes correctly"""
        # neither a bare call nor a call with data alone is accepted
        for bad_args in ((), (array([[2, 3]]),)):
            self.assertRaises(TypeError, Profile, *bad_args)

        # only alphabet
        alpha_only = Profile(array([[0.2, 0.8], [0.7, 0.3]]), "AB")
        self.assertEqual(alpha_only.Data, [[0.2, 0.8], [0.7, 0.3]])
        self.assertEqual(alpha_only.Alphabet, "AB")
        self.assertEqual(alpha_only.CharOrder, list("AB"))
        translated = translate("ABBA", alpha_only._translation_table)
        self.assertEqual(translated, "\x00\x01\x01\x00")

        # alphabet and char order
        with_order = Profile(array([[0.1, 0.2], [0.4, 0.3]]), Alphabet=DNA, CharOrder="AG")
        self.assertEqual(with_order.CharOrder, "AG")
        assert with_order.Alphabet is DNA

        # non-character alphabet
        numeric = Profile(array([[0.1, 0.2], [0.4, 0.3]]), Alphabet=[7, 3], CharOrder=[3, 7])
        self.assertEqual(numeric.CharOrder, [3, 7])
        self.assertEqual(numeric.Alphabet, [7, 3])
        self.assertEqual(numeric.Data, [[0.1, 0.2], [0.4, 0.3]])

    def test_str(self):
        """__str__: should match the string form of the underlying data array
        """
        expected = str(array([[1, 1], [0, 0]]))
        self.assertEqual(str(self.empty_row), expected)

    def test_make_translation_table(self):
        """_make_translation_table: should return correct table from char order
        """
        profile = Profile(array([[0.2, 0.8], [0.7, 0.3]]), "ABCDE", "AB")
        # "A" and "B" map to their positions (0 and 1) in the char order
        translated = translate("ABBA", profile._translation_table)
        self.assertEqual(translated, "\x00\x01\x01\x00")

    def test_hasValidData(self):
        """hasValidData: should work on full and empty profiles"""
        normalized = self.full.copy()
        normalized.normalizePositions()
        # only the normalized copy of the full profile counts as valid
        checks = ((normalized, True), (self.empty_row, False), (self.empty, False))
        for profile, expected in checks:
            self.assertEqual(profile.hasValidData(), expected)

    def test_hasValidAttributes(self):
        """hasValidAttributes: should judge CharOrder against Data and Alphabet
        """
        cases = [
            # self.Data doesn't match len(CharOrder)
            ("BAC", False),
            # not all chars in CharOrder are in Alphabet
            ("AX", False),
            # should be fine
            ("CB", True),
        ]
        for char_order, expected in cases:
            profile = Profile(array([[1, 2], [3, 4]]), Alphabet="ABCD", CharOrder=char_order)
            self.assertEqual(profile.hasValidAttributes(), expected)

    def test_isValid(self):
        """isValid: should be True only for valid data with valid attributes"""
        # valid data, valid attributes
        good = Profile(array([[0.3, 0.7], [0.8, 0.2]]), Alphabet="AB", CharOrder="AB")
        # invalid data, valid attributes
        bad_data = Profile(array([[1, 2], [3, 4]]), Alphabet="ABCD", CharOrder="BA")
        # valid data, invalid attributes ("F" is not in the alphabet)
        bad_attributes = Profile(
            array([[0.3, 0.7], [0.8, 0.2]]), Alphabet="ABCD", CharOrder="AF"
        )

        expectations = [(good, True), (bad_data, False), (bad_attributes, False)]
        for profile, expected in expectations:
            self.assertEqual(profile.isValid(), expected)

    def test_dataAt(self):
        """dataAt: should work on valid position and character"""
        data = array([[0.2, 0.4, 0.4, 0], [0.1, 0, 0.9, 0], [0.1, 0.2, 0.3, 0.4]])
        profile = Profile(data, Alphabet="TCAG")
        self.assertEqual(profile.dataAt(0, "C"), 0.4)
        self.assertEqual(profile.dataAt(1, "T"), 0.1)
        # a character outside "TCAG", a negative position, and a position
        # beyond the three rows must all be rejected
        for position, character in ((1, "U"), (-2, "T"), (5, "T")):
            self.assertRaises(ProfileError, profile.dataAt, position, character)

    def test_copy(self):
        """copy: should act as expected while rebinding/modifying attributes
        """
        p = Profile(array([[1, 1], [0.7, 0.3]]), {"A": "A", "G": "G", "R": "AG"}, "AG")
        p_copy = p.copy()
        # a fresh copy shares all three attributes with the original
        assert p.Data is p_copy.Data
        assert p.Alphabet is p_copy.Alphabet
        assert p.CharOrder is p_copy.CharOrder

        # modifying p.Data in place modifies p_copy.Data, because both names
        # still refer to the same array
        p.Data[1, 1] = 100
        assert p_copy.Data[1, 1] == 100
        assert p.Alphabet is p_copy.Alphabet

        # normalizing p.Data rebinds it, so p_copy.Data is unchanged
        p.normalizePositions()
        assert p.Data is not p_copy.Data

        # adding something to the alphabet changes both p and p_copy
        p.Alphabet["Y"] = "TC"
        assert p.Alphabet is p_copy.Alphabet

        # rebinding the CharOrder only changes the original
        p.CharOrder = "XX"
        assert p.CharOrder is not p_copy.CharOrder

    def test_normalizePositions(self):
        """normalizePositions: should normalize or raise appropriate error
        """
        # every row of the full profile is divided by its own total
        full = self.full.copy()
        full.normalizePositions()
        expected = array([[2 / 6, 4 / 6], [3 / 8, 5 / 8], [4 / 12, 8 / 12]])
        self.assertEqual(full.Data, expected)
        self.assertEqual(sum(full.Data, 1), [1, 1, 1])

        # an all-zero column does not get in the way
        zero_col = self.empty_col.copy()
        zero_col.normalizePositions()
        self.assertEqual(zero_col.Data, array([[0, 1], [0, 1]]))

        # a row that adds up to zero cannot be normalized
        zero_row = self.empty_row.copy()
        self.assertRaises(ProfileError, zero_row.normalizePositions)
        all_zero = Profile(array([[0.0, 0.0]]), "AB")
        self.assertRaises(ProfileError, all_zero.normalizePositions)

        # negative numbers: rows that already add up to 1 stay as they are ...
        sums_to_one = Profile(array([[3, -2], [4, -3]]), "AB")
        sums_to_one.normalizePositions()
        self.assertEqual(sums_to_one.Data, array([[3, -2], [4, -3]]))
        # ... while a row that adds up to 0 raises
        sums_to_zero = Profile(array([[3, -3], [4, -3]]), "AB")
        self.assertRaises(ProfileError, sums_to_zero.normalizePositions)

    def test_normalizeSequences(self):
        """normalizeSequences: should normalize or raise appropriate error
        """
        # every column of the full profile is divided by its own total
        full = self.full.copy()
        full.normalizeSequences()
        expected = array([[2 / 9, 4 / 17], [3 / 9, 5 / 17], [4 / 9, 8 / 17]])
        self.assertEqual(full.Data, expected)
        self.assertEqual(sum(full.Data, axis=0), [1, 1])

        # an all-zero row does not get in the way
        zero_row = self.empty_row.copy()
        zero_row.normalizeSequences()
        self.assertEqual(zero_row.Data, array([[1, 1], [0, 0]]))

        # a column that adds up to zero cannot be normalized
        zero_col = self.empty_col.copy()
        self.assertRaises(ProfileError, zero_col.normalizeSequences)
        all_zero = Profile(array([[0.0], [0.0]]), "AB")
        self.assertRaises(ProfileError, all_zero.normalizeSequences)

        # negative numbers: columns that already add up to 1 stay as they are ...
        sums_to_one = Profile(array([[3, 4], [-2, -3]]), "AB")
        sums_to_one.normalizeSequences()
        self.assertEqual(sums_to_one.Data, array([[3, 4], [-2, -3]]))
        # ... while a column that adds up to 0 raises
        sums_to_zero = Profile(array([[3, 4], [-3, -3]]), "AB")
        self.assertRaises(ProfileError, sums_to_zero.normalizeSequences)

    def test_prettyPrint_without_parameters(self):
        """prettyPrint: should work without parameters passed in"""
        full = self.full
        plain = full.prettyPrint()
        self.assertEqual(plain, "2\t4\n3\t5\n4\t8")
        with_header = full.prettyPrint(include_header=True)
        self.assertEqual(with_header, "A\tB\n2\t4\n3\t5\n4\t8")
        transposed = full.prettyPrint(transpose_data=True)
        self.assertEqual(transposed, "2\t3\t4\n4\t5\t8")
        header_and_transposed = full.prettyPrint(include_header=True, transpose_data=True)
        self.assertEqual(header_and_transposed, "A\t2\t3\t4\nB\t4\t5\t8")

        # an empty profile prints as an empty string either way
        self.assertEqual(self.empty.prettyPrint(), "")
        self.assertEqual(self.empty.prettyPrint(transpose_data=True), "")

        # it will still print with invalid data (e.g. if len(CharOrder)
        # doesn't match the data)
        mismatched = self.full.copy()
        mismatched.CharOrder = "ABC"
        self.assertEqual(
            mismatched.prettyPrint(include_header=True),
            "A\tB\tC\n2\t4\t \n3\t5\t \n4\t8\t ",
        )
        # it will truncate the CharOrder if data is transposed
        # and CharOrder is longer than the number of rows in the
        # transposed data
        self.assertEqual(
            mismatched.prettyPrint(include_header=True, transpose_data=True),
            "A\t2\t3\t4\nB\t4\t5\t8",
        )

    def test_prettyPrint_four_cases(self):
        """prettyPrint: with/without header/transpose/limit"""
        # The 3x4 fixture is the only profile used here; the former
        # `p = self.full` was overwritten before it was ever read.
        p = self.pp
        # no options: all columns
        self.assertEqual(p.prettyPrint(), "1\t 2\t 3\t 4\n5\t 6\t 7\t 8\n9\t10\t11\t12")
        # column limit only
        self.assertEqual(p.prettyPrint(column_limit=3), "1\t 2\t 3\n5\t 6\t 7\n9\t10\t11")
        # column limit plus header
        self.assertEqual(
            p.prettyPrint(column_limit=3, include_header=True), "A\t B\t C\n1\t 2\t 3\n5\t 6\t 7\n9\t10\t11"
        )
        # transposed data without header, two different limits
        self.assertEqual(
            p.prettyPrint(column_limit=3, include_header=False, transpose_data=True),
            "1\t5\t 9\n2\t6\t10\n3\t7\t11\n4\t8\t12",
        )
        self.assertEqual(
            p.prettyPrint(column_limit=2, include_header=False, transpose_data=True), "1\t5\n2\t6\n3\t7\n4\t8"
        )
        # transposed data with header
        self.assertEqual(
            p.prettyPrint(column_limit=3, include_header=True, transpose_data=True),
            "A\t1\t5\nB\t2\t6\nC\t3\t7\nD\t4\t8",
        )

    def test_reduce_wrong_size(self):
        """reduce: should fail when profiles have different sizes"""
        two_columns = Profile(array([[1, 0], [0, 1]]), Alphabet="AB")
        three_columns = Profile(array([[1, 0, 0], [1, 0, 0]]), Alphabet="ABC")
        self.assertRaises(ProfileError, two_columns.reduce, three_columns)

    def test_reduce_normalization_error(self):
        """reduce: fails when input or output can't be normalized"""
        # Will raise errors when input data can't be normalized
        for target, other in ((self.empty, self.empty), (self.full, self.empty_row)):
            self.assertRaises(ProfileError, target.reduce, other, add)

        # don't normalize input, but do normalize output
        # fails when one row adds up to zero
        positive = Profile(array([[3, 3], [4, 4]]), "AB")
        cancelling = Profile(array([[3, 3], [-4, -4]]), "AB")
        self.assertRaises(ProfileError, positive.reduce, cancelling, add, False, True)

    def test_reduce_operators(self):
        """reduce: should work fine with different operators
        """
        # different operators, normalize input, don't normalize output
        left = Profile(array([[1, 0, 0], [0, 1, 0]]), Alphabet="ABC")
        right = Profile(array([[1, 0, 0], [0, 0, 1]]), Alphabet="ABC")

        self.assertEqual(left.reduce(right).Data, array([[1, 0, 0], [0, 0.5, 0.5]]))
        operator_results = [
            (add, array([[2, 0, 0], [0, 1, 1]])),
            (subtract, array([[0, 0, 0], [0, 1, -1]])),
            (multiply, array([[1, 0, 0], [0, 0, 0]])),
        ]
        for operator, expected in operator_results:
            reduced = left.reduce(right, operator, normalize_input=True, normalize_output=False)
            self.assertEqual(reduced.Data, expected)

        self.assertRaises(ProfileError, left.reduce, right, divide, normalize_input=True, normalize_output=False)

        # don't normalize and normalize only input
        ascending = Profile(array([[1, 2], [3, 4]]), Alphabet="AB")
        descending = Profile(array([[4, 3], [2, 1]]), Alphabet="AB")

        raw_sum = ascending.reduce(descending, add, normalize_input=False, normalize_output=False)
        self.assertEqual(raw_sum.Data, array([[5, 5], [5, 5]]))
        normalized_sum = ascending.reduce(descending, add, normalize_input=True, normalize_output=False)
        self.assertFloatEqual(
            normalized_sum.Data,
            array([[19 / 21, 23 / 21], [23 / 21, 19 / 21]]),
        )

        # normalize input and output
        wide_left = Profile(array([[1, 1, 0, 0], [1, 1, 1, 1]]), Alphabet="ABCD")
        wide_right = Profile(array([[1, 0, 0, 0], [1, 0, 0, 1]]), Alphabet="ABCD")

        fully_normalized = wide_left.reduce(wide_right, add, normalize_input=True, normalize_output=True)
        self.assertEqual(
            fully_normalized.Data,
            array([[0.75, 0.25, 0, 0], [0.375, 0.125, 0.125, 0.375]]),
        )

        # it can collapse empty profiles when normalizing is turned off
        collapsed = self.empty.reduce(self.empty, normalize_input=False, normalize_output=False)
        self.assertEqual(collapsed.Data.tolist(), [[]])

        # more specific tests of the operators will be in the
        # separate functions

    def test__add_(self):
        """__add__: should not normalize input or output, just add"""
        fractions = Profile(array([[0.3, 0.4, 0.1, 0], [0.1, 0.1, 0.1, 0.7]]), Alphabet="ABCD")
        counts = Profile(array([[1, 0, 0, 0], [1, 0, 0, 1]]), Alphabet="ABCD")
        total = fractions + counts
        self.assertEqual(total.Data, array([[1.3, 0.4, 0.1, 0], [1.1, 0.1, 0.1, 1.7]]))
        # an empty profile can't be added to a non-empty one, only to itself
        self.assertRaises(ProfileError, self.empty.__add__, fractions)
        empty_total = self.empty + self.empty
        self.assertEqual(empty_total.Data.tolist(), [[]])

    def test__sub_(self):
        """__sub__: should subtract two profiles, no normalization"""
        fractions = Profile(array([[0.3, 0.4, 0.1, 0], [0.1, 0.1, 0.1, 0.7]]), Alphabet="ABCD")
        counts = Profile(array([[1, 0, 0, 0], [1, 0, 0, 1]]), Alphabet="ABCD")
        difference = fractions - counts
        self.assertFloatEqual(difference.Data, array([[-0.7, 0.4, 0.1, 0], [-0.9, 0.1, 0.1, -0.3]]))

    def test__mul_(self):
        """__mul__: should multiply two profiles, no normalization"""
        left = Profile(array([[1, -2, 3, 0], [1, 1, 1, 0.5]]), Alphabet="ABCD")
        right = Profile(array([[1, 0, 0, 0], [1, 0, 3, 2]]), Alphabet="ABCD")
        product = left * right
        self.assertEqual(product.Data, array([[1, 0, 0, 0], [1, 0, 3, 1]]))

    def test__div_(self):
        """__div__ and __truediv__: always true division b/c __future__.division
        """
        numerator = Profile(array([[2, 3], [4, 5]]), "AB")
        int_zero = Profile(array([[1, 0], [4, 5]]), "AB")  # contains an int 0
        float_zero = Profile(array([[1, 0.0], [4, 5]]), "AB")  # contains a float 0.0
        no_zero = Profile(array([[1, 2], [8.0, 5]]), "AB")  # no zeros at all

        self.assertRaises(ProfileError, numerator.__truediv__, int_zero)
        # infinity in result data
        self.assertRaises(ProfileError, numerator.__div__, float_zero)
        quotient = numerator.__div__(no_zero)
        self.assertFloatEqual(quotient.Data, array([[2, 1.5], [0.5, 1]]))

    def test_distance(self):
        """distance: should return correct distance between the profiles
        """
        base = Profile(array([[2, 4], [3, 1]]), "AB")
        shifted = Profile(array([[4, 6], [5, 3]]), "AB")
        extra_row = Profile(array([[4, 6], [5, 3], [1, 1]]), "AB")
        flat_two = Profile(array([2, 2]), "AB")
        flat_three = Profile(array([2, 2, 2]), "AB")
        empty = Profile(array([[]]), "AB")

        # the distance is the same in both directions
        self.assertEqual(base.distance(shifted), 4)
        self.assertEqual(shifted.distance(base), 4)
        self.assertEqual(base.distance(flat_two), sqrt(6))
        self.assertEqual(empty.distance(empty), 0)

        # Raises error when frames are not aligned
        for misaligned in (extra_row, flat_three):
            self.assertRaises(ProfileError, base.distance, misaligned)

    def test_toOddsMatrix(self):
        """toOddsMatrix: should work on valid data or raise an error
        """
        freqs = array(
            [
                [0.1, 0.3, 0.5, 0.1],
                [0.25, 0.25, 0.25, 0.25],
                [0.05, 0.8, 0.05, 0.1],
                [0.7, 0.1, 0.1, 0.1],
                [0.6, 0.15, 0.05, 0.2],
            ]
        )
        profile = Profile(freqs, Alphabet="ACTG")
        odds = array(
            [
                [0.4, 1.2, 2, 0.4],
                [1, 1, 1, 1],
                [0.2, 3.2, 0.2, 0.4],
                [2.8, 0.4, 0.4, 0.4],
                [2.4, 0.6, 0.2, 0.8],
            ]
        )
        expected = Profile(odds, Alphabet="ACTG")

        # calling without frequencies gives the same result as passing
        # 0.25 for each of the four symbols
        self.assertEqual(profile.toOddsMatrix().Data, expected.Data)
        assert profile.Alphabet is profile.toOddsMatrix().Alphabet
        uniform = [0.25, 0.25, 0.25, 0.25]
        self.assertEqual(profile.toOddsMatrix(uniform).Data, expected.Data)

        # fails if symbol_freqs has wrong size
        too_many = [0.25, 0.25, 0.25, 0.25, 0.25, 0.25]
        self.assertRaises(ProfileError, profile.toOddsMatrix, too_many)
        self.assertRaises(ProfileError, self.zero_entry.toOddsMatrix, [0.1, 0.2, 0.3])

        # works on empty profile
        self.assertEqual(self.empty.toOddsMatrix().Data.tolist(), [[]])

        # works with different input
        zero_entry_odds = self.zero_entry.toOddsMatrix()
        self.assertEqual(zero_entry_odds.Data, array([[1.2, 0.8, 0, 2], [0, 0, 3.2, 0.8]]))
        skewed_odds = self.zero_entry.toOddsMatrix([0.1, 0.2, 0.3, 0.4])
        self.assertFloatEqual(skewed_odds.Data, array([[3, 1, 0, 1.25], [0, 0, 2.667, 0.5]]), 1e-3)

        # fails when one of the background frequencies is 0
        self.assertRaises(ProfileError, self.zero_entry.toOddsMatrix, [0.1, 0.2, 0.3, 0])

    def test_toLogOddsMatrix(self):
        """toLogOddsMatrix: should work as expected"""
        # Kept short on purpose: the heavy lifting is in toOddsMatrix,
        # which has its own thorough test.
        freqs = array(
            [
                [0.1, 0.3, 0.5, 0.1],
                [0.25, 0.25, 0.25, 0.25],
                [0.05, 0.8, 0.05, 0.1],
                [0.7, 0.1, 0.1, 0.1],
                [0.6, 0.15, 0.05, 0.2],
            ]
        )
        profile = Profile(freqs, Alphabet="ACTG")
        log_odds = array(
            [
                [-1.322, 0.263, 1.0, -1.322],
                [0.0, 0.0, 0.0, 0.0],
                [-2.322, 1.678, -2.322, -1.322],
                [1.485, -1.322, -1.322, -1.322],
                [1.263, -0.737, -2.322, -0.322],
            ]
        )
        expected = Profile(log_odds, Alphabet="ACTG")
        # expected values are rounded to three decimals, hence the tolerance
        self.assertFloatEqual(profile.toLogOddsMatrix().Data, expected.Data, eps=1e-3)
        # works on empty matrix
        empty_result = self.empty.toLogOddsMatrix()
        self.assertEqual(empty_result.Data.tolist(), [[]])

    def test__score_indices(self):
        """_score_indices: should score valid index arrays at a given offset"""
        # A fresh array is built for every call, so no call can see changes
        # made to the input by an earlier one.
        idx_values = [3, 1, 2, 0, 2, 2, 3]

        from_start = self.score1._score_indices(array([0, 1, 1, 3, 0, 3]), offset=0)
        self.assertEqual(from_start, [6, 2, -3, 0])

        expected_by_offset = (
            (0, [0.3, 1.4, 0.8, 1.4, 1.7]),
            (3, [1.4, 1.7]),
        )
        for start, expected in expected_by_offset:
            self.assertFloatEqual(self.score2._score_indices(array(idx_values), offset=start), expected)

        # The method itself does not validate its input (that happens
        # elsewhere), so an out-of-range index surfaces as a plain IndexError.
        out_of_range = array([3, 1, 63, 0, 4, 2, 3])
        self.assertRaises(IndexError, self.score2._score_indices, out_of_range, offset=3)

    def test__score_profile(self):
        """_score_profile: should score valid profiles at a given offset"""
        char_order = "TCAG"
        first_data = array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0.5, 0.5], [0, 0, 0, 1], [0.25, 0.25, 0.25, 0.25]])
        first = Profile(first_data, char_order)
        second_data = array(
            [[0, 1, 0, 0], [0.2, 0, 0.8, 0], [0, 0, 0.5, 0.5], [1 / 3, 1 / 3, 0, 1 / 3], [0.25, 0.25, 0.25, 0.25]]
        )
        second = Profile(second_data, char_order)
        scorer = self.score2

        self.assertFloatEqual(scorer._score_profile(first, offset=0), [0.55, 1.25, 0.45])
        self.assertFloatEqual(scorer._score_profile(first, offset=2), [0.45])
        self.assertFloatEqual(scorer._score_profile(second, offset=0), [1.49, 1.043, 0.483], 1e-3)

        # Input validation lives elsewhere, not in this method. An offset
        # that is too large therefore raises nothing here; it just produces
        # an (unexpected) empty result.
        past_the_end = scorer._score_profile(first, offset=3)
        self.assertFloatEqual(past_the_end.tolist(), [])

    def test_score_sequence(self):
        """score: should work correctly for a plain sequence string as input"""
        seq = "TCAAGT"

        # ordinary valid input, scored from the first position
        self.assertEqual(self.score1.score("ATTCAC", offset=0), [6, 2, -3, 0])
        self.assertFloatEqual(self.score2.score(seq, offset=0), [0.5, 1.6, 1.7, 0.5])

        # starting further into the sequence
        for start, expected in ((2, [1.7, 0.5]), (3, [0.5])):
            self.assertFloatEqual(self.score2.score(seq, offset=start), expected)

        # an offset of 4 is rejected for this sequence
        self.assertRaises(ProfileError, self.score2.score, seq, offset=4)

        # the shortest sequence that can still be scored
        self.assertFloatEqual(self.score2.score("AGT", offset=0), [0.5])

        # a sequence that is too short (here: empty) is rejected
        self.assertRaises(ProfileError, self.score2.score, "", offset=0)

        # an empty profile cannot score anything
        self.assertRaises(ProfileError, self.empty.score, "ACGT")

        # characters missing from the character order are rejected
        self.assertRaises(ProfileError, self.score2.score, "ACBRT")

    def test_score_sequence_object(self):
        """score: should accept sequence objects as well as strings"""
        raw = "ATTCAC"
        expected = [6, 2, -3, 0]

        # DNA.Sequence input
        dna_seq = DNA.Sequence(raw)
        self.assertEqual(self.score1.score(dna_seq, offset=0), expected)

        # ModelSequence input
        model_seq = ModelSequence(raw, Alphabet=DNA.Alphabet)
        self.assertEqual(self.score1.score(model_seq, offset=0), expected)

    def test_score_no_trans_table(self):
        """score: should still work after the translation table is removed"""
        weights = array([[-1, 0, 1, 2], [-2, 2, 0, 0], [-3, 5, 1, 0]])
        profile = Profile(Data=weights, Alphabet=DNA, CharOrder="ATGC")

        # drop the stored translation table straight from the instance dict
        del profile.__dict__["_translation_table"]

        # scoring must succeed without it
        scores = profile.score(DNA.Sequence("ATTCAC"), offset=0)
        self.assertEqual(scores, [6, 2, -3, 0])

    def test_score_profile(self):
        """score: should work correctly when the input is a Profile"""
        order = "TCAG"
        five_rows = Profile(
            array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0.5, 0.5], [0, 0, 0, 1], [0.25, 0.25, 0.25, 0.25]]), order
        )
        five_rows_mixed = Profile(
            array(
                [[0, 1, 0, 0], [0.2, 0, 0.8, 0], [0, 0, 0.5, 0.5], [1 / 3, 1 / 3, 0, 1 / 3], [0.25, 0.25, 0.25, 0.25]]
            ),
            order,
        )
        three_rows = Profile(array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1]]), order)
        two_rows = Profile(array([[1, 0, 0, 0], [0, 1, 0, 0]]), order)
        other_order = Profile(array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1]]), "AGTC")

        score = self.score2.score

        # ordinary valid input
        self.assertFloatEqual(score(five_rows, offset=0), [0.55, 1.25, 0.45])
        self.assertFloatEqual(score(five_rows_mixed, offset=0), [1.49, 1.043, 0.483], 1e-3)

        # starting further into the profile
        for start, expected in ((1, [1.25, 0.45]), (2, [0.45])):
            self.assertFloatEqual(score(five_rows, offset=start), expected)

        # an offset of 3 is rejected for the five-row profile
        self.assertRaises(ProfileError, score, five_rows, offset=3)

        # the shortest profile that can still be scored
        self.assertFloatEqual(score(three_rows, offset=0), [0.6])

        # a profile that is too short is rejected
        self.assertRaises(ProfileError, score, two_rows, offset=0)

        # an empty profile cannot score anything
        self.assertRaises(ProfileError, self.empty.score, five_rows)

        # a differing character order is rejected
        self.assertRaises(ProfileError, score, other_order)

    def test_rowUncertainty(self):
        """rowUncertainty: should handle full and empty profiles"""
        filled = Profile(array([[0.25, 0.25, 0.25, 0.25], [0.5, 0.5, 0, 0]]), "ABCD")
        self.assertEqual(filled.rowUncertainty(), [2, 1])

        # profiles without any data produce an empty result
        self.assertEqual(self.empty.rowUncertainty().tolist(), [])
        rows_without_columns = Profile(array([[], [], []]), "")
        self.assertEqual(rows_without_columns.rowUncertainty().tolist(), [])

        # a 1D profile is rejected
        self.assertRaises(ProfileError, self.oned.rowUncertainty)

    def test_columnUncertainty(self):
        """columnUncertainty: should handle full and empty profiles"""
        filled = Profile(array([[0.25, 0.5], [0.25, 0.5], [0.25, 0], [0.25, 0]]), "AB")
        self.assertEqual(filled.columnUncertainty(), [2, 1])

        # profiles without any columns produce an empty result
        self.assertEqual(self.empty.columnUncertainty().tolist(), [])
        rows_without_columns = Profile(array([[], [], []]), "")
        self.assertEqual(rows_without_columns.columnUncertainty().tolist(), [])

        # a 1D profile is rejected
        self.assertRaises(ProfileError, self.oned.columnUncertainty)

    def test_rowDegeneracy(self):
        """rowDegeneracy: should work as expected"""
        consensus_cases = (
            ({}, [1, 1, 1, 2, 1]),
            ({"cutoff": 0.5}, [1, 1, 1, 2, 1]),
            ({"cutoff": 0.75}, [1, 2, 1, 3, 2]),
            # A row that appears to sum exactly to the cutoff is not always
            # detected, because of floating point error (see the second row).
            ({"cutoff": 1}, [2, 4, 1, 4, 2]),
            # An unreachable cutoff gives the number of columns for each row.
            ({"cutoff": 1.5}, [4, 4, 4, 4, 4]),
        )
        consensus = self.consensus
        for options, expected in consensus_cases:
            self.assertEqual(consensus.rowDegeneracy(**options), expected)

        mixed = self.not_same_value
        for cutoff, expected in ((0.95, [4, 2, 4, 1]), (1.4, [4, 3, 4, 1])):
            self.assertEqual(mixed.rowDegeneracy(cutoff=cutoff), expected)

        self.assertEqual(self.empty.rowDegeneracy(), [])

    def test_columnDegeneracy(self):
        """columnDegeneracy: should work as expected"""
        # Both fixtures are transposed in place, so the expected values below
        # match the ones used for rowDegeneracy.
        consensus = self.consensus
        consensus.Data = transpose(consensus.Data)
        mixed = self.not_same_value
        mixed.Data = transpose(mixed.Data)

        default_result = consensus.columnDegeneracy()
        self.assertEqual(default_result, [1, 1, 1, 2, 1])

        consensus_cases = (
            (0.5, [1, 1, 1, 2, 1]),
            (0.75, [1, 2, 1, 3, 2]),
            # A column that appears to sum exactly to the cutoff is not always
            # detected, because of floating point error (see the second column).
            (1, [2, 4, 1, 4, 2]),
            # An unreachable cutoff gives the number of rows for each column.
            (1.5, [4, 4, 4, 4, 4]),
        )
        for cutoff, expected in consensus_cases:
            self.assertEqual(consensus.columnDegeneracy(cutoff=cutoff), expected)

        for cutoff, expected in ((0.95, [4, 2, 4, 1]), (1.4, [4, 3, 4, 1])):
            self.assertEqual(mixed.columnDegeneracy(cutoff=cutoff), expected)

        self.assertEqual(self.empty.columnDegeneracy(), [])

    def test_rowMax(self):
        """rowMax: should return the largest value of every row"""
        expected = array([0.8, 0.7, 1, 0.4, 0.5])
        self.assertEqual(self.consensus.rowMax(), expected)

    def test_toConsensus(self):
        """toConsensus: should work with all the different options"""
        consensus_cases = (
            ({"fully_degenerate": False}, "AGGAT"),
            ({"fully_degenerate": True}, "WVGNY"),
            ({"cutoff": 0.75}, "ARGHY"),
            ({"cutoff": 0.95}, "WVGNY"),
            ({"cutoff": 2}, "WVGNY"),
        )
        profile = self.consensus
        for options, expected in consensus_cases:
            self.assertEqual(profile.toConsensus(**options), expected)

        mixed_cases = (
            ({"fully_degenerate": False}, "CGTA"),
            ({"fully_degenerate": True}, "NBYA"),
            ({"cutoff": 0.75}, "YSYA"),
            ({"cutoff": 2}, "NBYA"),
            ({"cutoff": 5}, "NBYA"),
            # With both fully_degenerate and a cutoff given, the cutoff wins
            # and drives the calculation.
            ({"cutoff": 0.75, "fully_degenerate": True}, "YSYA"),
        )
        profile = self.not_same_value
        for options, expected in mixed_cases:
            self.assertEqual(profile.toConsensus(**options), expected)

        # An alphabet without Degenerates leads to an AttributeError.
        plain = Profile(array([[0.2, 0.8], [0.7, 0.3]]), "AB")
        self.assertRaises(AttributeError, plain.toConsensus, cutoff=0.5)

    def test_toConsensus_include_all(self):
        """toConsensus: should include all possibilities when include_all=True"""
        cases = (
            (
                array([[0.2, 0, 0.8, 0], [0, 0.1, 0.2, 0.7], [0, 0, 0, 1], [0.2, 0.3, 0.4, 0.1], [0.5, 0.5, 0, 0]]),
                "AGGAY",
            ),
            (
                array([[0.25, 0.25, 0.25, 0.25], [0.1, 0.1, 0.1, 0], [0.4, 0, 0.4, 0], [0, 0.2, 0.2, 0.3]]),
                "NHWV",
            ),
        )
        # Build and check one profile at a time, in the listed order.
        for data, expected in cases:
            profile = Profile(data, Alphabet=DNA, CharOrder="TCAG")
            self.assertEqual(profile.toConsensus(cutoff=0.4, include_all=True), expected)

    def test_randomIndices(self):
        """randomIndices: 99% of new frequencies should be within 3*SD

        Draws n index vectors from a normalized random profile, counts how
        often each index occurs at each position and checks that at most 1%
        of the counts lie more than three standard deviations from the
        expected count.
        """
        r_num, c_num = 100, 20
        num_elements = r_num * c_num
        r = random([r_num, c_num])
        p = Profile(r, "A" * c_num)
        p.normalizePositions()
        d = p.Data
        n = 1000

        # Test only works on normalized profile, b/c of 1-d below:
        # n*d is the expected count and sqrt(n*d*(1-d)) its binomial SD.
        means = n * d
        three_stds = sqrt(d * (1 - d) * n) * 3
        result = [p.randomIndices() for _ in range(n)]
        a = Alignment(transpose(result))

        def absoluteProfile(alignment, char_order):
            """Return a position x character matrix of absolute counts."""
            # Use the alignment that was passed in rather than silently
            # reaching for the enclosing test's variable.
            f = alignment.columnFreqs()
            res = zeros([len(f), len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    # NOTE(review): presumably the keys are characters whose
                    # ordinal is the drawn index -- confirm against Alignment.
                    res[row, ord(i)] = freq[i]
            return res

        ap = absoluteProfile(a, p.CharOrder)
        failure = abs(ap - means) > three_stds
        # float() keeps this a true ratio even under integer division, and
        # assertTrue (unlike a bare assert) is not stripped by `python -O`.
        failure_rate = sum(sum(failure)) / float(num_elements)
        self.assertTrue(failure_rate <= 0.01)

    def test_randomSequence(self):
        """randomSequence: 99% of new frequencies should be within 3*SD

        Draws n sequences from a normalized random profile, counts how often
        each character occurs at each position and checks that at most 1% of
        the counts lie more than three standard deviations from the expected
        count.
        """
        r_num, c_num = 100, 20
        num_elements = r_num * c_num
        alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        r = random([r_num, c_num])
        p = Profile(r, alpha[:c_num])
        p.normalizePositions()
        d = p.Data
        n = 1000

        # Test only works on normalized profile, b/c of 1-d below:
        # n*d is the expected count and sqrt(n*d*(1-d)) its binomial SD.
        means = n * d
        three_stds = sqrt(d * (1 - d) * n) * 3

        a = Alignment([p.randomSequence() for _ in range(n)])

        def absoluteProfile(alignment, char_order):
            """Return a position x character matrix of absolute counts."""
            # Use the alignment that was passed in rather than silently
            # reaching for the enclosing test's variable.
            f = alignment.columnFreqs()
            res = zeros([len(f), len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    # the column is the character's position in char_order
                    col = char_order.index(i)
                    res[row, col] = freq[i]
            return res

        ap = absoluteProfile(a, p.CharOrder)
        failure = abs(ap - means) > three_stds
        # float() keeps this a true ratio even under integer division, and
        # assertTrue (unlike a bare assert) is not stripped by `python -O`.
        failure_rate = sum(sum(failure)) / float(num_elements)
        self.assertTrue(failure_rate <= 0.01)
예제 #52
0
def AlnToProfile(aln, alphabet=None, char_order=None, split_degenerates=False,\
    weights=None):
    """Generates a Profile object from an Alignment.

    aln: Alignment object
    alphabet (optional): an Alphabet object (or list of chars, but if you 
        want to split degenerate symbols, the alphabet must have a 
        Degenerates property. Default is the alphabet of the first seq in 
        the alignment.
    char_order (optional): order of the characters in the profile. Default
        is list(alphabet)
    split_degenerates (optional): Whether you want the counts for the 
        degenerate symbols to be divided over the non-degenerate symbols they
        code for.
    weights (optional): dictionary of seq_id: weight. If not entered all seqs
        are weighted equally

    A Profile is a position x character matrix describing which characters
    occur at each position of an alignment. The Profile is always normalized,
    so it gives the probabilities of each character at each position.
    
    Ignoring chars: you can ignore characters in the alignment by not putting
    the char in the CharOrder. If you ignore all characters at a particular
    position, an error will be raised, because the profile can't be normalized.

    Splitting degenerates: you can split degenerate characters over the 
    non-degenerate characters they code for. For example: R = A or G. So,
    an R at a position counts for 0.5 A and 0.5 G.
   
    Example:
    seq1    TCAG    weight: 0.5
    seq2    TAR-    weight: 0.25
    seq3    YAG-    weight: 0.25
    Profile(aln,alphabet=DNA,char_order="TACG",weights=w,
    split_degenerates=True)
    Profile:
       T      A      C      G
    [[ 0.875  0.     0.125  0.   ]
     [ 0.     0.5    0.5    0.   ]
     [ 0.     0.625  0.     0.375]
     [ 0.     0.     0.     1.   ]]
    """

    if alphabet is None:
        alphabet = aln.values()[0].MolType
    if char_order is None:
        char_order = list(alphabet)
    if weights is None:
        weights = dict.fromkeys(aln.keys(), 1 / len(aln))

    char_meaning = CharMeaningProfile(alphabet, char_order,\
        split_degenerates)

    profiles = []
    for k, v in aln.items():
        idxs = array(str(v).upper(), 'c').view(UInt8)
        profiles.append(char_meaning.Data[idxs] * weights[k])
    s = reduce(add, profiles)

    result = Profile(s, alphabet, char_order)
    try:
        result.normalizePositions()
    except Exception, e:
        raise ValueError, e