Exemple #1
0
    def test_rowUncertainty(self):
        """rowUncertainty: per-row uncertainty for full, empty and 1D profiles
        """
        # a uniform row over four characters and a 50/50 row over two
        # characters are expected to give 2 and 1 respectively
        freqs = array([[.25, .25, .25, .25], [.5, .5, 0, 0]])
        filled = Profile(freqs, "ABCD")
        self.assertEqual(filled.rowUncertainty(), [2, 1])

        # profiles that hold no data at all give an empty result
        no_columns = Profile(array([[], [], []]), "")
        for blank in (self.empty, no_columns):
            self.assertEqual(blank.rowUncertainty().tolist(), [])

        # a one-dimensional profile is rejected
        self.assertRaises(ProfileError, self.oned.rowUncertainty)
Exemple #2
0
 def test_hasValidAttributes(self):
     """hasValidAttributes: CharOrder has to fit both Data and Alphabet
     """
     # (CharOrder, expected result) for a two-column profile over "ABCD"
     scenarios = (
         ("BAC", False),  # self.Data doesn't match len(CharOrder)
         ("AX", False),   # not all chars in CharOrder are in Alphabet
         ("CB", True),    # should be fine
     )
     for char_order, expected in scenarios:
         profile = Profile(array([[1, 2], [3, 4]]), Alphabet="ABCD",
             CharOrder=char_order)
         self.assertEqual(profile.hasValidAttributes(), expected)
Exemple #3
0
 def test_toConsensus_include_all(self):
     """toConsensus: include_all=True should keep all possibilities
     """
     five_positions = array([
         [.2, 0, .8, 0],
         [0, .1, .2, .7],
         [0, 0, 0, 1],
         [.2, .3, .4, .1],
         [.5, .5, 0, 0],
     ])
     first = Profile(five_positions, Alphabet=DNA, CharOrder="TCAG")
     observed = first.toConsensus(cutoff=0.4, include_all=True)
     self.assertEqual(observed, "AGGAY")

     four_positions = array([
         [.25, 0.25, .25, 0.25],
         [0.1, .1, .1, 0],
         [.4, 0, .4, 0],
         [0, .2, 0.2, 0.3],
     ])
     second = Profile(four_positions, Alphabet=DNA, CharOrder="TCAG")
     observed = second.toConsensus(cutoff=0.4, include_all=True)
     self.assertEqual(observed, "NHWV")
Exemple #4
0
    def test__div_(self):
        """__div__ and __truediv__: zero divisors raise, else true division
        """
        numerator = Profile(array([[2, 3], [4, 5]]), "AB")
        int_zero = Profile(array([[1, 0], [4, 5]]), "AB")  # contains int 0
        float_zero = Profile(array([[1, 0.0], [4, 5]]), "AB")  # contains 0.0
        no_zero = Profile(array([[1, 2], [8.0, 5]]), "AB")  # no zero entries

        # a zero in the divisor is an error, whether it is an int ...
        self.assertRaises(ProfileError, numerator.__truediv__, int_zero)
        # ... or a float (original note: infinity in result data)
        self.assertRaises(ProfileError, numerator.__div__, float_zero)

        quotient = numerator.__div__(no_zero)
        self.assertFloatEqual(quotient.Data, array([[2, 1.5], [0.5, 1]]))
Exemple #5
0
    def test_toConsensus(self):
        """toConsensus: should work with all the different options
        """
        # (keyword arguments, expected consensus) for self.consensus
        consensus_cases = [
            ({'fully_degenerate': False}, "AGGAT"),
            ({'fully_degenerate': True}, "WVGNY"),
            ({'cutoff': 0.75}, "ARGHY"),
            ({'cutoff': 0.95}, "WVGNY"),
            ({'cutoff': 2}, "WVGNY"),
        ]
        for options, expected in consensus_cases:
            self.assertEqual(self.consensus.toConsensus(**options), expected)

        # same idea for the profile whose rows hold unequal values
        not_same_cases = [
            ({'fully_degenerate': False}, "CGTA"),
            ({'fully_degenerate': True}, "NBYA"),
            ({'cutoff': 0.75}, "YSYA"),
            ({'cutoff': 2}, "NBYA"),
            ({'cutoff': 5}, "NBYA"),
            # when both fully_degenerate and a cutoff are specified, the
            # cutoff takes priority and is used in the calculation
            ({'cutoff': 0.75, 'fully_degenerate': True}, "YSYA"),
        ]
        for options, expected in not_same_cases:
            self.assertEqual(self.not_same_value.toConsensus(**options),
                expected)

        # raises AttributeError when the Alphabet doesn't have Degenerates
        plain = Profile(array([[.2, .8], [.7, .3]]), "AB")
        self.assertRaises(AttributeError, plain.toConsensus, cutoff=.5)
Exemple #6
0
    def test_randomIndices(self):
        """randomIndices: 99% of new frequencies should be within 3*SD

        Draws n sets of random indices from a normalized random profile and
        checks that at most 1% of the observed counts differ from the
        expected count (n*d) by more than three standard deviations.
        """
        r_num, c_num = 100,20
        num_elements = r_num*c_num
        r = random([r_num,c_num])
        p = Profile(r,"A"*c_num)
        p.normalizePositions()
        d = p.Data
        n = 1000

        #Test only works on normalized profile, b/c of 1-d below
        means = n*d
        three_stds = sqrt(d*(1-d)*n)*3
        result = [p.randomIndices() for _ in range(n)]
        a = Alignment(transpose(result))

        def absoluteProfile(alignment,char_order):
            """Returns the counts from alignment.columnFreqs() as an array"""
            #read from the argument rather than from the enclosing 'a', so
            #the helper really describes the alignment it is given
            f = alignment.columnFreqs()
            res = zeros([len(f),len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    #the ordinal of the key is used directly as column index
                    res[row, ord(i)] = freq[i]
            return res

        ap = absoluteProfile(a,p.CharOrder)
        failure = abs(ap-means) > three_stds
        #assertTrue instead of a bare assert, which is stripped under -O
        self.assertTrue(sum(sum(failure))/num_elements <= 0.01)
Exemple #7
0
    def test_randomSequence(self):
        """randomSequence: 99% of new frequencies should be within 3*SD

        Draws n random sequences from a normalized random profile and checks
        that at most 1% of the observed counts differ from the expected
        count (n*d) by more than three standard deviations.
        """
        r_num, c_num = 100,20
        num_elements = r_num*c_num
        alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        r = random([r_num,c_num])
        p = Profile(r,alpha[:c_num])
        p.normalizePositions()
        d = p.Data
        n = 1000

        #Test only works on normalized profile, b/c of 1-d below
        means = n*d
        three_stds = sqrt(d*(1-d)*n)*3

        a = Alignment([p.randomSequence() for _ in range(n)])

        def absoluteProfile(alignment,char_order):
            """Returns the counts from alignment.columnFreqs() as an array"""
            #read from the argument rather than from the enclosing 'a', so
            #the helper really describes the alignment it is given
            f = alignment.columnFreqs()
            res = zeros([len(f),len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    #column = position of the character in char_order
                    col = char_order.index(i)
                    res[row, col] = freq[i]
            return res

        ap = absoluteProfile(a,p.CharOrder)
        failure = abs(ap-means) > three_stds
        #assertTrue instead of a bare assert, which is stripped under -O
        self.assertTrue(sum(sum(failure))/num_elements <= 0.01)
Exemple #8
0
def pos_char_weights(alignment, order=DNA_ORDER):
    """Returns the contribution of each character at each position.

    alignment: Alignment object
    order: the order of characters in the profile (all observed chars
        in the alignment). Every character observed in the alignment has
        to be present, because order.index(char) selects its row.

    Returns a Profile (Alphabet=order) whose Data has one row per character
    in order and one column per alignment position. A character observed in
    a column gets 1 / (k * count), where k is the number of different
    characters in that column and count is the value reported for that
    character by alignment.columnFreqs(). All other entries are 0.

    This function is used by the function position_based

    For example:
    GYVGS
    GFDGF
    GYDGF
    GYQGG

    pos 0:  G 1/(1*4)
    pos 1:  Y 1/(2*3)   F 1/(2*1)
    pos 2:  V 1/(3*1)   D 1/(3*2)   Q 1/(3*1)
    pos 3:  G 1/(1*4)
    pos 4:  S 1/(3*1)   F 1/(3*2)   G 1/(3*1)
    """
    counts = alignment.columnFreqs()
    a = zeros([len(order), alignment.SeqLen], Float64)
    for col, c in enumerate(counts):
        #number of different characters observed in this column
        n_chars = len(c)
        for char in c:
            #1.0 forces true division, so the weight is not truncated to 0
            #when 'from __future__ import division' is not in effect
            a[order.index(char), col] = 1.0 / (n_chars * c[char])
    return Profile(a, Alphabet=order)
Exemple #9
0
    def test_copy(self):
        """copy: should act as expected while rebinding/modifying attributes

        The copy refers to the same Data, Alphabet and CharOrder objects as
        the original: in-place changes show up in both, while rebinding an
        attribute on the original leaves the copy pointing at the old object.
        """
        p = Profile(array([[1, 1], [.7, .3]]), {
            'A': 'A',
            'G': 'G',
            'R': 'AG'
        }, "AG")
        p_copy = p.copy()
        self.assertTrue(p.Data is p_copy.Data)
        self.assertTrue(p.Alphabet is p_copy.Alphabet)
        self.assertTrue(p.CharOrder is p_copy.CharOrder)

        #modifying p.Data in place modifies p_copy.Data as well; check the
        #data itself (the old check looked at the Alphabet here)
        p.Data[1, 1] = 100
        self.assertTrue(p.Data is p_copy.Data)
        self.assertEqual(p_copy.Data[1, 1], 100)

        #normalizing p.Data rebinds it, so p_copy.Data is unchanged
        p.normalizePositions()
        self.assertTrue(p.Data is not p_copy.Data)

        #Adding something to the alphabet changes both p and p_copy
        p.Alphabet['Y'] = 'TC'
        self.assertTrue(p.Alphabet is p_copy.Alphabet)
        self.assertEqual(p_copy.Alphabet['Y'], 'TC')

        #Rebinding the CharOrder does only change the original
        p.CharOrder = 'XX'
        self.assertTrue(p.CharOrder is not p_copy.CharOrder)
Exemple #10
0
    def test_isValid(self):
        """isValid: needs both valid data and valid attributes"""
        # everything valid
        all_valid = Profile(array([[.3, .7], [.8, .2]]),
            Alphabet="AB", CharOrder="AB")
        # invalid data, valid attributes
        bad_data = Profile(array([[1, 2], [3, 4]]),
            Alphabet="ABCD", CharOrder="BA")
        # invalid attributes ('F' is not in the Alphabet), valid data
        bad_attributes = Profile(array([[.3, .7], [.8, .2]]),
            Alphabet="ABCD", CharOrder="AF")

        expectations = (
            (all_valid, True),
            (bad_data, False),
            (bad_attributes, False),
        )
        for profile, expected in expectations:
            self.assertEqual(profile.isValid(), expected)
Exemple #11
0
    def test_pos_char_weights(self):
        """pos_char_weights: should return correct contributions at each pos
        """
        # expected contribution of every observed character, one dict per
        # alignment position
        contributions_by_pos = [
            {'G': 1 / 4},
            {'Y': 1 / 6, 'F': 1 / 2},
            {'V': 1 / 3, 'D': 1 / 6, 'Q': 1 / 3},
            {'G': 1 / 4},
            {'G': 1 / 3, 'F': 1 / 6, 'S': 1 / 3},
        ]

        # spread them over a (characters x positions) array of zeros
        expected = zeros([len(PROTEIN_ORDER), self.aln2.SeqLen], Float64)
        for col, contributions in enumerate(contributions_by_pos):
            for char in contributions:
                row = PROTEIN_ORDER.index(char)
                expected[row, col] = contributions[char]
        expected_profile = Profile(expected, Alphabet=PROTEIN_ORDER)

        # check observed against expected
        observed_profile = pos_char_weights(self.aln2, PROTEIN_ORDER)
        self.assertEqual(observed_profile.Data, expected_profile.Data)
Exemple #12
0
 def test_toLogOddsMatrix(self):
     """toLogOddsMatrix: expected log-odds, also for the empty profile"""
     # Kept short on purpose: this mainly depends on toOddsMatrix, for
     # which everything has been tested.
     frequencies = array([
         [.1, .3, .5, .1],
         [.25, .25, .25, .25],
         [.05, .8, .05, .1],
         [.7, .1, .1, .1],
         [.6, .15, .05, .2],
     ])
     expected_log_odds = array([
         [-1.322, 0.263, 1., -1.322],
         [0., 0., 0., 0.],
         [-2.322, 1.678, -2.322, -1.322],
         [1.485, -1.322, -1.322, -1.322],
         [1.263, -0.737, -2.322, -0.322],
     ])
     p = Profile(frequencies, Alphabet="ACTG")
     p_exp = Profile(expected_log_odds, Alphabet="ACTG")
     observed = p.toLogOddsMatrix().Data
     self.assertFloatEqual(observed, p_exp.Data, eps=1e-3)

     # works on empty matrix
     empty_log_odds = self.empty.toLogOddsMatrix().Data
     self.assertEqual(empty_log_odds.tolist(), [[]])
Exemple #13
0
 def test_dataAt(self):
     """dataAt: value for a valid position/character, error otherwise"""
     freqs = array([[.2, .4, .4, 0], [.1, 0, .9, 0], [.1, .2, .3, .4]])
     p = Profile(freqs, Alphabet="TCAG")

     # valid lookups
     for (pos, char), expected in (((0, 'C'), .4), ((1, 'T'), .1)):
         self.assertEqual(p.dataAt(pos, char), expected)

     # character outside the alphabet, negative position, position past
     # the last row
     for pos, char in ((1, 'U'), (-2, 'T'), (5, 'T')):
         self.assertRaises(ProfileError, p.dataAt, pos, char)
Exemple #14
0
 def test_score_no_trans_table(self):
     """score: should still work once the translation table is gone
     """
     weights = array([[-1, 0, 1, 2], [-2, 2, 0, 0], [-3, 5, 1, 0]])
     profile = Profile(Data=weights, Alphabet=DNA, CharOrder="ATGC")

     # take the translation table off the instance ...
     del profile.__dict__['_translation_table']

     # ... and score a sequence against the profile anyway
     observed = profile.score(DNA.Sequence("ATTCAC"), offset=0)
     self.assertEqual(observed, [6, 2, -3, 0])
Exemple #15
0
    def test__score_profile(self):
        """_score_profile: expected scores for valid profiles and offsets"""
        first = Profile(array([
            [1, 0, 0, 0],
            [0, 1, 0, 0],
            [0, 0, .5, .5],
            [0, 0, 0, 1],
            [.25, .25, .25, .25],
        ]), "TCAG")
        second = Profile(array([
            [0, 1, 0, 0],
            [.2, 0, .8, 0],
            [0, 0, .5, .5],
            [1/3, 1/3, 0, 1/3],
            [.25, .25, .25, .25],
        ]), "TCAG")

        observed = self.score2._score_profile(first, offset=0)
        self.assertFloatEqual(observed, [.55, 1.25, .45])

        observed = self.score2._score_profile(first, offset=2)
        self.assertFloatEqual(observed, [.45])

        observed = self.score2._score_profile(second, offset=0)
        self.assertFloatEqual(observed, [1.49, 1.043, .483], 1e-3)

        # _score_profile does not validate its input itself; validation is
        # done elsewhere. An offset of 3 therefore gives no error here, just
        # a (probably unexpected) empty result.
        observed = self.score2._score_profile(first, offset=3).tolist()
        self.assertFloatEqual(observed, [])
Exemple #16
0
def mVOR(alignment, n=1000, order=DNA_ORDER):
    """Returns sequence weights according to the modified Voronoi method.

    alignment: Alignment object
    n: sample size, i.e. the number of random profiles that are generated
    order: order of the characters found in the alignment; used to build
        both the per-sequence profiles and the random profiles

    mVOR is a modification of the VOR method. Rather than drawing discrete
    random sequences it draws random profiles ('random generalized
    sequences'). Sampling from continuous sequence space is more even and
    avoids random samples that are equidistant to several sequences in the
    alignment, which can be a problem in small datasets (for longer
    sequences the likelihood of such ties is negligible).

    Implementation notes:
    A random profile is an array with one row per alignment position and
    one column per character in order, filled with independent draws from
    the exponential distribution with scale 1 (lambda=1, and thus mean=1)
    and then normalized with normalizePositions(). Each random profile is
    compared, via Profile.distance (described in the original notes as the
    Euclidean distance), to the profile of every single sequence in the
    alignment. row_to_vote turns those distances into votes, which are
    summed per sequence over all n samples.

    Returns a normalized Weights object keyed by sequence name.
    """
    vote_totals = zeros(len(alignment.Names), Float64)

    # profile of every individual sequence, keyed by sequence name
    per_sequence = dict((name, SeqToProfile(seq, alphabet=order))
        for name, seq in alignment.items())

    for _ in range(n):
        # draw one random profile and normalize it
        draws = exponential(1, [alignment.SeqLen, len(order)])
        random_profile = Profile(Data=draws, Alphabet=order)
        random_profile.normalizePositions()

        # distance of every sequence profile to the random profile, in
        # the order of alignment.Names, converted into votes
        distances = [per_sequence[name].distance(random_profile)
            for name in alignment.Names]
        vote_totals += row_to_vote(array(distances))

    result = Weights(dict(zip(alignment.Names, vote_totals)))
    result.normalize()
    return result
Exemple #17
0
    def test_normalizePositions(self):
        """normalizePositions: rows get normalized or ProfileError is raised
        """
        full = self.full.copy()
        full.normalizePositions()
        self.assertEqual(full.Data,
            array([[2/6, 4/6], [3/8, 5/8], [4/12, 8/12]]))
        self.assertEqual(sum(full.Data,1), [1, 1, 1])

        # an all-zero column is no problem
        with_empty_col = self.empty_col.copy()
        with_empty_col.normalizePositions()
        self.assertEqual(with_empty_col.Data, array([[0, 1], [0, 1]]))

        # an all-zero row can't be normalized
        all_zero = Profile(array([[0.0, 0.0]]), "AB")
        for failing in (self.empty_row.copy(), all_zero):
            self.assertRaises(ProfileError, failing.normalizePositions)

        # negative numbers: rows that add up to 1 stay as they are
        signed = Profile(array([[3, -2], [4, -3]]), "AB")
        signed.normalizePositions()
        self.assertEqual(signed.Data, array([[3, -2], [4, -3]]))
        # ... and a row that adds up to 0 is an error
        cancelling = Profile(array([[3, -3], [4, -3]]), "AB")
        self.assertRaises(ProfileError, cancelling.normalizePositions)
Exemple #18
0
    def test_normalizeSequences(self):
        """normalizeSequences: columns get normalized or ProfileError raised
        """
        full = self.full.copy()
        full.normalizeSequences()
        self.assertEqual(full.Data,
            array([[2/9, 4/17], [3/9, 5/17], [4/9, 8/17]]))
        self.assertEqual(sum(full.Data, axis=0), [1, 1])

        # an all-zero row is no problem
        with_empty_row = self.empty_row.copy()
        with_empty_row.normalizeSequences()
        self.assertEqual(with_empty_row.Data, array([[1, 1], [0, 0]]))

        # an all-zero column can't be normalized
        all_zero = Profile(array([[0.0], [0.0]]), "AB")
        for failing in (self.empty_col.copy(), all_zero):
            self.assertRaises(ProfileError, failing.normalizeSequences)

        # negative numbers: columns that add up to 1 stay as they are
        signed = Profile(array([[3, 4], [-2, -3]]), "AB")
        signed.normalizeSequences()
        self.assertEqual(signed.Data, array([[3, 4], [-2, -3]]))
        # ... and a column that adds up to 0 is an error
        cancelling = Profile(array([[3, 4], [-3, -3]]), "AB")
        self.assertRaises(ProfileError, cancelling.normalizeSequences)
Exemple #19
0
 def test_init(self):
     """__init__: should set all attributes correctly"""
     # Data and Alphabet are both required
     self.assertRaises(TypeError, Profile)
     self.assertRaises(TypeError, Profile, array([[2, 3]]))

     # only alphabet: CharOrder is derived from it
     only_alphabet = Profile(array([[.2, .8], [.7, .3]]), "AB")
     self.assertEqual(only_alphabet.Data, [[.2, .8], [.7, .3]])
     self.assertEqual(only_alphabet.Alphabet, "AB")
     self.assertEqual(only_alphabet.CharOrder, list("AB"))
     encoded = translate("ABBA", only_alphabet._translation_table)
     self.assertEqual(encoded, "\x00\x01\x01\x00")

     # alphabet and char order
     with_order = Profile(array([[.1, .2], [.4, .3]]), Alphabet=DNA,
         CharOrder="AG")
     self.assertEqual(with_order.CharOrder, "AG")
     assert with_order.Alphabet is DNA

     # non-character alphabet
     numeric = Profile(array([[.1, .2], [.4, .3]]), Alphabet=[7, 3],
         CharOrder=[3, 7])
     self.assertEqual(numeric.CharOrder, [3, 7])
     self.assertEqual(numeric.Alphabet, [7, 3])
     self.assertEqual(numeric.Data, [[.1, .2], [.4, .3]])
Exemple #20
0
def freqs_from_aln_array(seqs):
    """Returns per-position freqs from arbitrary size alignment.

    Warning: fails if all seqs aren't the same length.
    written by Rob Knight

    seqs = list of lines from aligned fasta file

    Returns a Profile built from an integer count array with one row per
    symbol of the sequence alphabet and one column per alignment position,
    together with that alphabet.

    Raises ValueError when seqs contains no sequences; without this check
    the function would fail on an unbound local instead.
    """
    result = None
    alphabet = None
    indices = None
    for label, seq in MinimalFastaParser(seqs):
        # Currently cogent does not support . characters for gaps, converting
        # to - characters for compatability.
        seq = ModelDnaSequence(seq.replace('.','-'))
        alphabet = seq.Alphabet
        if result is None:
            # size the count array from the first sequence
            result = zeros((len(alphabet), len(seq)),dtype=int)
            indices = arange(len(seq), dtype=int)
        # add one count at (seq._data[i], i) for every position i
        # (assumes seq._data holds per-position alphabet indices - TODO
        # confirm against ModelDnaSequence)
        result[seq._data,indices] += 1
    if result is None:
        raise ValueError("No sequences found; cannot compute frequencies.")
    return Profile(result, alphabet)
Exemple #21
0
    def test_distance(self):
        """distance: correct value for matching shapes, error otherwise
        """
        base = Profile(array([[2, 4], [3, 1]]), "AB")
        shifted = Profile(array([[4, 6], [5, 3]]), "AB")
        extra_row = Profile(array([[4, 6], [5, 3], [1, 1]]), "AB")
        flat_pair = Profile(array([2, 2]), "AB")
        flat_triple = Profile(array([2, 2, 2]), "AB")
        no_data = Profile(array([[]]), "AB")

        # the distance doesn't depend on the order of the two profiles
        self.assertEqual(base.distance(shifted), 4)
        self.assertEqual(shifted.distance(base), 4)
        # a 1D profile with a matching number of columns is accepted
        self.assertEqual(base.distance(flat_pair), sqrt(6))
        # an empty profile is at distance 0 from itself
        self.assertEqual(no_data.distance(no_data), 0)

        # Raises error when frames are not aligned
        for mismatched in (extra_row, flat_triple):
            self.assertRaises(ProfileError, base.distance, mismatched)
Exemple #22
0
    def test_reduce_operators(self):
        """reduce: different operators and normalization settings
        """
        p1 = Profile(array([[1, 0, 0], [0, 1, 0]]), Alphabet="ABC")
        p2 = Profile(array([[1, 0, 0], [0, 0, 1]]), Alphabet="ABC")

        # default operator and normalization
        self.assertEqual(p1.reduce(p2).Data, array([[1, 0, 0], [0, .5, .5]]))

        # different operators, normalize input, don't normalize output
        raw_output = dict(normalize_input=True, normalize_output=False)
        per_operator = (
            (add, array([[2, 0, 0], [0, 1, 1]])),
            (subtract, array([[0, 0, 0], [0, 1, -1]])),
            (multiply, array([[1, 0, 0], [0, 0, 0]])),
        )
        for operator, expected in per_operator:
            combined = p1.reduce(p2, operator, **raw_output)
            self.assertEqual(combined.Data, expected)

        # dividing these two profiles is an error
        self.assertRaises(ProfileError, p1.reduce, p2, divide, **raw_output)

        # don't normalize and normalize only input
        p3 = Profile(array([[1, 2], [3, 4]]), Alphabet="AB")
        p4 = Profile(array([[4, 3], [2, 1]]), Alphabet="AB")

        untouched = p3.reduce(p4, add, normalize_input=False,
            normalize_output=False)
        self.assertEqual(untouched.Data, array([[5, 5], [5, 5]]))
        input_only = p3.reduce(p4, add, normalize_input=True,
            normalize_output=False)
        self.assertFloatEqual(input_only.Data,
            array([[19/21, 23/21], [23/21, 19/21]]))

        # normalize input and output
        p5 = Profile(array([[1, 1, 0, 0], [1, 1, 1, 1]]), Alphabet="ABCD")
        p6 = Profile(array([[1, 0, 0, 0], [1, 0, 0, 1]]), Alphabet="ABCD")

        both = p5.reduce(p6, add, normalize_input=True,
            normalize_output=True)
        self.assertEqual(both.Data,
            array([[.75, .25, 0, 0], [.375, .125, .125, .375]]))

        # it can collapse empty profiles when normalizing is turned off
        collapsed = self.empty.reduce(self.empty, normalize_input=False,
            normalize_output=False)
        self.assertEqual(collapsed.Data.tolist(), [[]])
Exemple #23
0
 def test__sub_(self):
     """__sub__: element-wise difference of two profiles, not normalized"""
     minuend = Profile(array([[.3, .4, .1, 0], [.1, .1, .1, .7]]),
         Alphabet="ABCD")
     subtrahend = Profile(array([[1, 0, 0, 0], [1, 0, 0, 1]]),
         Alphabet="ABCD")
     difference = minuend - subtrahend
     expected = array([[-.7, .4, .1, 0], [-.9, .1, .1, -.3]])
     self.assertFloatEqual(difference.Data, expected)
Exemple #24
0
def AlnToProfile(aln, alphabet=None, char_order=None, split_degenerates=False,
    weights=None):
    """Generates a Profile object from an Alignment.

    aln: Alignment object
    alphabet (optional): an Alphabet object (or list of chars, but if you
        want to split degenerate symbols, the alphabet must have a
        Degenerates property). Default is the MolType of the first seq in
        the alignment.
    char_order (optional): order of the characters in the profile. Default
        is list(alphabet)
    split_degenerates (optional): Whether you want the counts for the
        degenerate symbols to be divided over the non-degenerate symbols they
        code for.
    weights (optional): dictionary of seq_id: weight. If not entered all seqs
        are weighted equally (1/len(aln) each).

    Returns the normalized Profile.
    Raises ValueError (wrapping the original error) when the profile can't
    be normalized.

    A Profile is a position x character matrix describing which characters
    occur at each position of an alignment. The Profile is always normalized,
    so it gives the probabilities of each character at each position.

    Ignoring chars: you can ignore characters in the alignment by not putting
    the char in the CharOrder. If you ignore all characters at a particular
    position, an error will be raised, because the profile can't be normalized.

    Splitting degenerates: you can split degenerate characters over the
    non-degenerate characters they code for. For example: R = A or G. So,
    an R at a position counts for 0.5 A and 0.5 G.

    Example:
    seq1    TCAG    weight: 0.5
    seq2    TAR-    weight: 0.25
    seq3    YAG-    weight: 0.25
    AlnToProfile(aln,alphabet=DNA,char_order="TACG",weights=w,
    split_degenerates=True)
    Profile:
       T      A      C      G
    [[ 0.875  0.     0.125  0.   ]
     [ 0.     0.5    0.5    0.   ]
     [ 0.     0.625  0.     0.375]
     [ 0.     0.     0.     1.   ]]
    """

    if alphabet is None:
        alphabet = aln.values()[0].MolType
    if char_order is None:
        char_order = list(alphabet)
    if weights is None:
        #1.0 forces true division; with integer division every default
        #weight would be truncated to 0
        weights = dict.fromkeys(aln.keys(), 1.0 / len(aln))

    #per-character rows that are looked up for each symbol of a sequence
    char_meaning = CharMeaningProfile(alphabet, char_order,
        split_degenerates)

    profiles = []
    for k, v in aln.items():
        #byte value of every (upper-cased) character indexes char_meaning
        idxs = array(str(v).upper(), 'c').view(UInt8)
        profiles.append(char_meaning.Data[idxs] * weights[k])
    #sum the weighted per-sequence profiles
    s = reduce(add, profiles)

    result = Profile(s, alphabet, char_order)
    try:
        result.normalizePositions()
    except Exception as e:
        raise ValueError(e)
    #hand the normalized profile back to the caller; previously the
    #function fell off the end and returned None
    return result
Exemple #25
0
def SeqToProfile(seq, alphabet=None, char_order=None,
    split_degenerates=False):
    """Generates a Profile object from a Sequence object.

    seq: Sequence object
    alphabet (optional): Alphabet object (if you want to split degenerate
        symbols, the alphabet object should have a Degenerates property).
        Default is the MolType of the Sequence object.
    char_order (optional): the order in which the characters occur in the
        Profile. Default is list(alphabet)
    split_degenerates (optional): whether the counts for degenerate symbols
        should be divided over the non-degenerate symbols they code for.

    A Profile is a position x character matrix describing which characters
    occur at each position. A sequence (as opposed to an alignment) has
    exactly one character per position, so in general a sequence profile
    contains only ones and zeros. Degenerate characters can be split: an R
    stands for A or G, so it counts as 50% A and 50% G. Characters can also
    be ignored, which leaves positions (rows) that contain only zeros.

    Example:
    Sequence = ACGU
    Profile(seq, CharOrder=UCAG):
    U   C   A   G
    0   0   1   0   first pos
    0   1   0   0   second pos
    0   0   0   1   third pos
    1   0   0   0   fourth pos

    Sequence= GURY
    Profile(seq,CharOrder=UCAG, split_degenerates=True)
    U   C   A   G
    0   0   0   1   first pos
    1   0   0   0   second pos
    0   0   .5  .5  third pos
    .5  .5  0   0   fourth pos

    Characters can also be ignored
    Sequence = ACN-
    Profile(seq, CharOrder=UCAG, split_degenerates=True)
    U   C   A   G
    0   0   1   0   first pos
    0   1   0   0   second pos
    .25 .25 .25 .25 third pos
    0   0   0   0   fourth pos <--contains only zeros
    """
    if alphabet is None:
        alphabet = seq.MolType
    if char_order is None:
        char_order = list(alphabet)

    # The meaning of every character follows from the alphabet, the
    # character order and whether degenerates are split.
    meaning_table = CharMeaningProfile(alphabet, char_order,
        split_degenerates).Data

    # The byte value of each (upper-cased) character of the sequence picks
    # the matching row of the meaning table.
    byte_values = array(str(seq).upper(), 'c').view(UInt8)
    return Profile(meaning_table[byte_values], alphabet, char_order)
Exemple #26
0
from cogent.core.profile import Profile
from cogent import LoadSeqs, RNA
# Usage walk-through for Profile: build a position-frequency profile from an
# RNA alignment, normalize it, derive consensus sequences and score a
# sequence against a slice of the profile.

# load the sequences from the fasta file as RNA
aln = LoadSeqs("data/trna_profile.fasta", moltype=RNA)
# number of sequences, then len(aln) (presumably the alignment length --
# TODO confirm)
print len(aln.Seqs)
print len(aln)
# per-position frequencies as returned by getPosFreqs(); shown before and
# after normalizePositions()
pf = aln.getPosFreqs()
print pf.prettyPrint(include_header=True, column_limit=6, col_sep='   ')
pf.normalizePositions()
print pf.prettyPrint(include_header=True, column_limit=6, col_sep='   ')
print pf.isValid()
# non-zero entries at position index 4, labelled with the CharOrder
print '\n'.join([
    '%s: %.3f' % (c, f) for (c, f) in zip(pf.CharOrder, pf.dataAt(4)) if f != 0
])
print pf.toConsensus(fully_degenerate=False)
# the Alphabet is rebound to RNA before the degenerate/cutoff consensus calls
# (presumably because those need the alphabet's degenerate symbols -- confirm)
pf.Alphabet = RNA
print "to consensus"
print pf.toConsensus(fully_degenerate=True)
print pf.toConsensus(cutoff=0.8)
print pf.toConsensus(cutoff=0.6)
# sub-profile made of rows 54..59 of the normalized data
loop_profile = Profile(pf.Data[54:60, :], Alphabet=RNA, CharOrder=pf.CharOrder)
print loop_profile.prettyPrint(include_header=True,
                               column_limit=6,
                               col_sep='   ')
yeast = RNA.Sequence(
    'GCGGAUUUAGCUCAGUU-GGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA'
)
# score the sequence against the sub-profile; print all scores, the highest
# score and the index at which it occurs
scores = loop_profile.score(yeast)
print scores
print max(scores)
print scores.argmax()
Exemple #27
0
 def setUp(self):
     """Builds the Profile fixtures shared by all Profile tests"""
     # small count profiles, some with an all-zero row or column
     self.full = Profile(array([[2, 4], [3, 5], [4, 8]]), "AB")
     self.empty = Profile(array([[]]), "AB")
     self.empty_row = Profile(array([[1, 1], [0, 0]]), "AB")
     self.empty_col = Profile(array([[0, 1], [0, 1]]), "AB")

     # DNA profiles used by the consensus tests
     consensus_data = array([
         [.2, 0, .8, 0],
         [0, .1, .2, .7],
         [0, 0, 0, 1],
         [.2, .3, .4, .1],
         [.5, .5, 0, 0],
     ])
     self.consensus = Profile(consensus_data, Alphabet=DNA,
         CharOrder="TCAG")
     not_same_data = array([
         [.3, .5, .1, .1],
         [.4, .6, 0, .7],
         [.3, .2, 0, 0],
         [0, 0, 4, 0],
     ])
     self.not_same_value = Profile(not_same_data, Alphabet=DNA,
         CharOrder="TCAG")

     self.zero_entry = Profile(array([[.3, .2, 0, .5], [0, 0, .8, .2]]),
         Alphabet="UCAG")

     # profiles used by the scoring tests
     score1_data = array([[-1, 0, 1, 2], [-2, 2, 0, 0], [-3, 5, 1, 0]])
     self.score1 = Profile(Data=score1_data, Alphabet=DNA,
         CharOrder="ATGC")
     score2_data = array([[.2, .4, .4, 0], [.1, 0, .9, 0], [.1, .2, .3, .4]])
     self.score2 = Profile(score2_data, Alphabet="TCAG")

     # a one-dimensional profile and a plain 3x4 profile
     self.oned = Profile(array([.25, .25, .25, .25]), "ABCD")
     self.pp = Profile(array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]),
         "ABCD")
Exemple #28
0
 def test_make_translation_table(self):
     """_make_translation_table: table should follow the char order
     """
     profile = Profile(array([[.2, .8], [.7, .3]]), "ABCDE", "AB")
     # with CharOrder "AB", 'A' translates to byte 0 and 'B' to byte 1
     encoded = translate("ABBA", profile._translation_table)
     self.assertEqual(encoded, "\x00\x01\x01\x00")
Exemple #29
0
 def test__mul_(self):
     """__mul__: element-wise product of two profiles, not normalized"""
     left = Profile(array([[1, -2, 3, 0], [1, 1, 1, .5]]), Alphabet="ABCD")
     right = Profile(array([[1, 0, 0, 0], [1, 0, 3, 2]]), Alphabet="ABCD")
     product = left * right
     self.assertEqual(product.Data, array([[1, 0, 0, 0], [1, 0, 3, 1]]))
Exemple #30
0
 def test_reduce_wrong_size(self):
     """reduce: profiles of different sizes should be rejected"""
     two_columns = Profile(array([[1, 0], [0, 1]]), Alphabet="AB")
     three_columns = Profile(array([[1, 0, 0], [1, 0, 0]]), Alphabet="ABC")
     self.assertRaises(ProfileError, two_columns.reduce, three_columns)