Example #1
0
    def test_SeqToProfile(self):
        """SeqToProfile: should work with different parameter settings

        Exercises an explicit char order, the default char order, a char
        order that contains every symbol of the sequence, and splitting of
        degenerate symbols with and without a degenerate symbol being part
        of the char order.
        """
        seq = DnaSequence("ATCGRYN-")
        n_positions = len(seq)

        def indicator_rows(n_chars, hot_columns):
            # One row per sequence position. Row i gets a single 1 in column
            # hot_columns[i]; rows past the end of hot_columns stay all zero.
            matrix = zeros([n_positions, n_chars], Float64)
            for row, col in enumerate(hot_columns):
                matrix[row, col] = 1
            return matrix.tolist()

        # The char order holds only the non-degenerate bases, so all other
        # characters are ignored and their positions are all-zero rows.
        expected = indicator_rows(4, [2, 0, 1, 3])
        observed = SeqToProfile(seq, char_order="TCAG",
                                split_degenerates=False)
        self.assertEqual(observed.Data.tolist(), expected)

        # Leaving the char order out should give the same profile.
        expected = indicator_rows(4, [2, 0, 1, 3])
        observed = SeqToProfile(seq, split_degenerates=False)
        self.assertEqual(observed.Data.tolist(), expected)

        # Every symbol of the sequence is in the char order, so no row is
        # all zeros. Degenerate symbols are not split.
        expected = indicator_rows(8, [2, 0, 1, 3, 4, 5, 6, 7])
        observed = SeqToProfile(seq, char_order="TCAGRYN-",
                                split_degenerates=False)
        self.assertEqual(observed.Data.tolist(), expected)

        # Split all degenerate symbols; the char order contains only the
        # non-degenerate symbols (and the gap character).
        expected = array([
            [0, 0, 1, 0, 0],
            [1, 0, 0, 0, 0],
            [0, 1, 0, 0, 0],
            [0, 0, 0, 1, 0],
            [0, 0, .5, .5, 0],
            [.5, .5, 0, 0, 0],
            [.25, .25, .25, .25, 0],
            [0, 0, 0, 0, 1],
        ]).tolist()
        observed = SeqToProfile(seq, char_order="TCAG-",
                                split_degenerates=True)
        self.assertEqual(observed.Data.tolist(), expected)

        # Split degenerates while one degenerate symbol (N) is itself in the
        # char order: that symbol keeps its own column and is not split.
        expected = array([
            [0, 0, 1, 0, 0, 0],
            [1, 0, 0, 0, 0, 0],
            [0, 1, 0, 0, 0, 0],
            [0, 0, 0, 1, 0, 0],
            [0, 0, .5, .5, 0, 0],
            [.5, .5, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0],
            [0, 0, 0, 0, 0, 1],
        ]).tolist()
        observed = SeqToProfile(seq, char_order="TCAGN-",
                                split_degenerates=True)
        self.assertEqual(observed.Data.tolist(), expected)
Example #2
0
def mVOR(alignment, n=1000, order=DNA_ORDER):
    """Returns sequence weights according to the modified Voronoi method.

    alignment: Alignment object
    n: sample size (the number of random profiles that are generated)
    order: the order of the characters found in the alignment; it is used
        to build both the per-sequence profiles and the random profiles.

    mVOR is a modification of the VOR method. Rather than drawing discrete
    random sequences, it draws random profiles. This samples the sequence
    space more evenly and avoids random sequences that are equidistant to
    several sequences in the alignment.

    Random generalized sequences (profiles filled with random numbers):
    Sequences that are equidistant to multiple sequences in the alignment
    can be a problem in small datasets; for longer sequences the likelihood
    of that event is negligible. Sampling from continuous sequence space
    sidesteps the problem. Each random profile is a two-dimensional array
    with one row per alignment position and one column per character in
    'order', filled with numbers drawn from the exponential distribution
    with mean 1 and then normalized per position (normalizePositions).
    Every random profile is compared to the profile of each single sequence
    in the alignment using the profile's distance method (described as the
    Euclidean distance). The distances are turned into votes by row_to_vote
    and the votes are summed over all n random profiles.

    Returns a Weights object keyed by sequence name, normalized via its
    normalize() method.
    """
    names = alignment.Names
    seq_len = alignment.SeqLen
    n_chars = len(order)

    # One profile per sequence in the alignment, keyed by sequence name.
    profile_by_name = {
        name: SeqToProfile(seq, alphabet=order)
        for name, seq in alignment.items()
    }

    vote_totals = zeros(len(names), Float64)
    for _ in range(n):
        # Draw a fresh random profile and normalize it per position.
        random_profile = Profile(
            Data=exponential(1, [seq_len, n_chars]),
            Alphabet=order,
        )
        random_profile.normalizePositions()

        # Distance from every sequence profile to the random profile, in
        # the order of the alignment names, converted into votes.
        distances = array(
            [profile_by_name[name].distance(random_profile) for name in names]
        )
        # Accumulate in place so the totals keep their Float64 type.
        vote_totals += row_to_vote(distances)

    result = Weights(dict(zip(names, vote_totals)))
    result.normalize()
    return result