Exemple #1
0
    def test_init_one_parameter(self):
        """PairFrequency should interpret single parameter as pair probs

        Checks three things:
          * with one argument, every expected pair has the expected
            probability and every other pair reported by the object is 0;
          * with an explicit list of pairs, the result equals a Freqs holding
            only those pairs;
          * incrementing a pair that is part of the object works, while
            incrementing any other pair raises KeyError.
        """
        obs = PairFrequency('UCCC')
        #'UCCC' is 1/4 U and 3/4 C; the expected values below are the
        #products of those single-base frequencies.
        exp = Freqs({('U','U'):0.0625, ('U','C'):0.1875,
                      ('C','U'):0.1875, ('C','C'):0.5625})

        for k, v in exp.items():
            self.assertEqual(v, obs[k])
        #anything obs reports beyond the expected pairs must be zero
        for k, v in obs.items():
            if k not in exp:
                self.assertEqual(v, 0)

        self.assertEqual(PairFrequency('UCCC', [('U','U'),('C','C')]),
            Freqs({('U','U'):0.1, ('C','C'):0.9}))
        #check that the alphabets are right: should not raise error on
        #incrementing characters already there, but should raise KeyError
        #on anything that's missing.
        p = PairFrequency('UCCC')
        p[('U','U')] += 1
        try:
            p[('X','U')] += 1
        except KeyError:
            pass
        else:
            #call form of raise: valid on both Python 2 and Python 3
            raise AssertionError("Expected KeyError.")
        p = PairFrequency('UCCC', (('C','C'),))
        p[('C','C')] += 1
        try:
            p[('U','U')] += 1
        except KeyError:
            pass
        else:
            raise AssertionError("Expected KeyError.")
Exemple #2
0
def MagePointFromBaseFreqs(freqs, get_label=None, get_color=None, \
    get_radius=None):
    """Returns a MagePoint from an object with counts for the bases.

    freqs must support .get(key, default) for the keys 'A', 'C', 'G', 'U'
    and 'T'. The T count is used in place of U only when the U count is
    missing or zero.

    get_label should be a function that calculates a label from the freqs.
    If get_label is not supplied, checks freqs.Label, freqs.Species, freqs.Id,
    freqs.Accession, and freqs.Name in that order. If get_label fails or none
    of the attributes is found, no label is written.

    get_color should be a function that calculates a color from the freqs.
    Default is freqs.Color if present, otherwise no color (i.e. the point has
    the color for the series), which will also happen if get_color fails.

    get_radius is similar to get_color; without it, float(freqs.Radius) is
    used when that attribute exists and converts cleanly.

    The callbacks are best-effort: any ordinary Exception they raise is
    swallowed and the corresponding value becomes None. KeyboardInterrupt
    and SystemExit are deliberately NOT swallowed.

    The counts are normalized over A, C, G and U; only the normalized
    A, C and G values are passed to MagePoint as coordinates.
    """
    label = None
    if get_label:
        try:
            label = get_label(freqs)
        except Exception:
            pass    #label will be assigned None below
    else:
        for attr in ('Label', 'Species', 'Id', 'Accession', 'Name'):
            if hasattr(freqs, attr):
                label = getattr(freqs, attr)
                #keep going if the label is empty
                if label is not None and label != '':
                    break
    #collapse every empty value to None, but keep a literal 0 as a label
    if not label and label != 0:
        label = None

    if get_color:
        try:
            color = get_color(freqs)
        except Exception:
            color = None
    else:
        color = getattr(freqs, 'Color', None)

    if get_radius:
        try:
            radius = get_radius(freqs)
        except Exception:
            radius = None
    else:
        if hasattr(freqs, 'Radius'):
            try:
                radius = float(freqs.Radius)
            except Exception:
                #non-numeric radius: fall back to the series default
                radius = None
        else:
            radius = None

    relevant = Freqs({'A':freqs.get('A',0), 'C':freqs.get('C',0),
        'G':freqs.get('G',0), 'U':freqs.get('U',0) or freqs.get('T',0)})
    relevant.normalize()
    return MagePoint((relevant['A'],relevant['C'],relevant['G']), Label=label,
        Color=color, Radius=radius)
Exemple #3
0
 def test_known_vals(self):
     """Composition should return precalculated elements for known cases"""
     #second argument is presumably the minimum count per symbol -- confirm
     for second_arg, expected_len in ((1, 969), (0, 1771)):
         self.assertEqual(len(Composition(5,second_arg,"ACGU")), expected_len)
     members = list(Composition(5,1,"ACGU"))
     #first and last generated compositions are the two extremes
     self.assertEqual(members[0], Freqs('A'*17+'CGU'))
     self.assertEqual(members[-1], Freqs('U'*17+'ACG'))
Exemple #4
0
def MagePointFromBaseFreqs(freqs, get_label=None, get_color=None, \
    get_radius=None):
    """Returns a MagePoint from an object with counts for the bases.

    freqs must support .get(key, default) for the keys 'A', 'C', 'G', 'U'
    and 'T'. The T count is used in place of U only when the U count is
    missing or zero.

    get_label should be a function that calculates a label from the freqs.
    If get_label is not supplied, checks freqs.Label, freqs.Species, freqs.Id,
    freqs.Accession, and freqs.Name in that order. If get_label fails or none
    of the attributes is found, no label is written.

    get_color should be a function that calculates a color from the freqs.
    Default is freqs.Color if present, otherwise no color (i.e. the point has
    the color for the series), which will also happen if get_color fails.

    get_radius is similar to get_color; without it, float(freqs.Radius) is
    used when that attribute exists and converts cleanly.

    The callbacks are best-effort: any ordinary Exception they raise is
    swallowed and the corresponding value becomes None. KeyboardInterrupt
    and SystemExit are deliberately NOT swallowed.

    The counts are normalized over A, C, G and U; only the normalized
    A, C and G values are passed to MagePoint as coordinates.
    """
    label = None
    if get_label:
        try:
            label = get_label(freqs)
        except Exception:
            pass    #label will be assigned None below
    else:
        for attr in ('Label', 'Species', 'Id', 'Accession', 'Name'):
            if hasattr(freqs, attr):
                label = getattr(freqs, attr)
                #keep going if the label is empty
                if label is not None and label != '':
                    break
    #collapse every empty value to None, but keep a literal 0 as a label
    if not label and label != 0:
        label = None

    if get_color:
        try:
            color = get_color(freqs)
        except Exception:
            color = None
    else:
        color = getattr(freqs, 'Color', None)

    if get_radius:
        try:
            radius = get_radius(freqs)
        except Exception:
            radius = None
    else:
        if hasattr(freqs, 'Radius'):
            try:
                radius = float(freqs.Radius)
            except Exception:
                #non-numeric radius: fall back to the series default
                radius = None
        else:
            radius = None

    relevant = Freqs({'A':freqs.get('A',0), 'C':freqs.get('C',0),
        'G':freqs.get('G',0), 'U':freqs.get('U',0) or freqs.get('T',0)})
    relevant.normalize()
    return MagePoint((relevant['A'],relevant['C'],relevant['G']), Label=label,
        Color=color, Radius=radius)
Exemple #5
0
def kendalls_tau(x, y, return_p=True):
    """returns kendall's tau

    Arguments:
        - x, y: the two series to compare; they are paired and ranked by
          as_paired_ranks before counting
        - return_p: returns the probability from the normal approximation when
          True, otherwise just returns tau

    Returns:
        (tau, p) when return_p is True, otherwise tau alone.

    Raises:
        ZeroDivisionError when there are fewer than two pairs, or when no
        pair differs in x (or none differs in y), since the denominator of
        tau is then zero.
    """
    ranked = as_paired_ranks(x, y)
    n = len(ranked)
    con = 0         # concordant pairs
    discor = 0      # discordant pairs
    x_tied = 0      # pairs tied in x only
    y_tied = 0      # pairs tied in y only
    for i in range(n - 1):
        x_1 = ranked[i][0]
        y_1 = ranked[i][1]
        for j in range(i + 1, n):
            x_diff = x_1 - ranked[j][0]
            y_diff = y_1 - ranked[j][1]
            if x_diff * y_diff > 0:
                con += 1
            elif x_diff and y_diff:
                # both differ but with opposite sign
                discor += 1
            else:
                # at least one difference is zero; pairs tied in both x and
                # y contribute to neither counter
                if x_diff:
                    y_tied += 1
                if y_diff:
                    x_tied += 1

    diff = con - discor
    total = con + discor
    denom = ((total + y_tied) * (total + x_tied))**0.5
    # float literals force true division even where "/" would otherwise
    # truncate integers
    variance = (4.0 * n + 10) / (9 * n * (n - 1))
    tau = diff / denom
    stat = tau

    if x_tied or y_tied:
        # with ties, switch to the tie-corrected statistic and variance;
        # x_tied / y_tied now hold the sizes of each group of equal values
        x_tied = array([v for _, v in Freqs(x).items() if v > 1])
        y_tied = array([v for _, v in Freqs(y).items() if v > 1])
        t0 = n * (n - 1) / 2.0
        t1 = sum(x_tied * (x_tied - 1)) / 2.0
        t2 = sum(y_tied * (y_tied - 1)) / 2.0
        stat = tau * sqrt((t0 - t1) * (t0 - t2))
        v0 = n * (n - 1) * (2 * n + 5)
        vt = sum(x_tied * (x_tied - 1) * (2 * x_tied + 5))
        vu = sum(y_tied * (y_tied - 1) * (2 * y_tied + 5))
        v1 = sum(x_tied * (x_tied - 1)) * sum(y_tied * (y_tied - 1))
        v2 = sum(x_tied * (x_tied - 1) * (x_tied - 2)) * \
               sum(y_tied * (y_tied - 1) * (y_tied - 2))
        variance = (v0 - vt - vu) / 18.0 + v1 / (2.0 * n * (n - 1)) + \
                   v2 / (9.0 * n * (n - 1) * (n - 2))
    if return_p:
        return tau, zprob(stat / variance**0.5)
    else:
        return tau
Exemple #6
0
    def __init__(self, data=None, Info=None, **kwargs):
        """Initializes BaseUsage with data, either sequence or dict of freqs.

        Info is used as the delegation handler. When it is not given, it is
        taken from data.Info if data has that attribute, otherwise a fresh
        InfoClass() is created.

        Additional kwargs are forwarded to Freqs.__init__ (e.g. to support
        copy). A false-valued data (None, empty) is passed on as [].
        """
        if Info is None:
            Info = data.Info if hasattr(data, 'Info') else InfoClass()
        Delegator.__init__(self, Info)
        Freqs.__init__(self, data or [], **kwargs)
Exemple #7
0
    def __init__(self, data=None, Info=None, **kwargs):
        """Initializes BaseUsage with data, either sequence or dict of freqs.

        Info is used as the delegation handler. When it is not given, it is
        taken from data.Info if data has that attribute, otherwise a fresh
        InfoClass() is created.

        Additional kwargs are forwarded to Freqs.__init__ (e.g. to support
        copy). A false-valued data (None, empty) is passed on as [].
        """
        handler = Info
        if handler is None:
            if hasattr(data, 'Info'):
                handler = data.Info
            else:
                handler = InfoClass()
        Delegator.__init__(self, handler)
        Freqs.__init__(self, data or [], **kwargs)
Exemple #8
0
def random_source(a, k, random_f=random):
    """Makes a random Markov source on alphabet a with memory k.

    Builds one entry per word produced by joining the items of
    cartesian_product([a]*k). Each entry is a Freqs pairing the symbols of a
    with the values of one random_f(len(a)) call, i.e. for every word w,
    pr(i|w) = rand().

    Returns a dict mapping word -> Freqs.
    """
    words = map(''.join, cartesian_product([a]*k))
    result = dict.fromkeys(words)
    #one independent draw of len(a) random values per distinct word
    for word in result:
        result[word] = Freqs(dict(zip(a, random_f(len(a)))))
    return result
Exemple #9
0
    def toFreqs(self):
        """Returns a Freqs object based on the histogram.

        Iterating over self must yield (bin, values) pairs. Each key of the
        returned Freqs is str(bin) and each value is len(values), i.e. the
        number of objects in that bin.
        """
        counts = Freqs()
        for bin_id, members in self:
            counts[str(bin_id)] = len(members)
        return counts
Exemple #10
0
    def codons(self, genetic_code=SGC, codon_usage=_equal_codons):
        """Predicts most likely set of codon frequencies.

        genetic_code[aa] supplies the codons belonging to each amino acid
        and codon_usage[codon] supplies the relative weight of each codon.
        Defaults are the standard genetic code and unbiased codon
        frequencies.

        The amino acid frequencies in self are normalized on a copy. Each
        amino acid's frequency is then split among its codons in proportion
        to the normalized codon_usage weights. Codons are upper-cased and
        written in RNA form (T replaced by U). Amino acids for which the
        code lists no codons are skipped.

        Returns a CodonUsage built from the codon frequencies, self.info and
        genetic_code.
        """
        aa_probs = Freqs(self)
        aa_probs.normalize()
        codon_probs = {}
        for aa, aa_freq in aa_probs.items():
            synonymous = [c.upper().replace('T','U') for c in genetic_code[aa]]
            if synonymous:  #code might be missing some amino acids?
                weights = Numbers([codon_usage[c] for c in synonymous])
                weights.normalize()
                for codon, c_freq in zip(synonymous, weights):
                    codon_probs[codon] = c_freq * aa_freq
        return CodonUsage(codon_probs, self.info, genetic_code)
Exemple #11
0
    def codons(self, genetic_code=SGC, codon_usage=_equal_codons):
        """Predicts most likely set of codon frequencies.

        genetic_code[aa] supplies the codons belonging to each amino acid
        and codon_usage[codon] supplies the relative weight of each codon.
        Defaults are the standard genetic code and unbiased codon
        frequencies.

        The amino acid frequencies in self are normalized on a copy. Each
        amino acid's frequency is then split among its codons in proportion
        to the normalized codon_usage weights. Codons are upper-cased and
        written in RNA form (T replaced by U). Amino acids for which the
        code lists no codons are skipped.

        Returns a CodonUsage built from the codon frequencies, self.info and
        genetic_code.
        """
        by_codon = {}
        aa_dist = Freqs(self)
        aa_dist.normalize()
        for aa, aa_freq in aa_dist.items():
            rna_codons = []
            for c in genetic_code[aa]:
                rna_codons.append(c.upper().replace('T','U'))
            if not rna_codons:
                #code might be missing some amino acids?
                continue
            usage = Numbers([codon_usage[c] for c in rna_codons])
            usage.normalize()
            for codon, c_freq in zip(rna_codons, usage):
                by_codon[codon] = c_freq * aa_freq
        return CodonUsage(by_codon, self.info, genetic_code)
Exemple #12
0
 def apply_to(s):
     """Returns True if items of s outside used_items occur more than x times.

     case_sens, used_items and x come from the enclosing scope. Unless
     case_sens is true, every item of a non-empty s is converted to a
     lower-case string before counting. Returns False when nothing outside
     used_items is present.
     """
     if not s or case_sens:
         used_s = s
     else:
         used_s = [str(item).lower() for item in s]
     fd = Freqs(used_s)
     other_counts = [fd[key] for key in fd if key not in used_items]
     if not other_counts:
         return False
     return reduce(add, other_counts) > x
    def test_init(self):
        """Unpaired region should generate right freqs, even after change"""
        r = UnpairedRegion("NN", Freqs({"C": 10, "U": 1, "A": 0}))
        seq = r.Current
        for pos in range(2):
            assert seq[pos] in "CU"
        self.assertEqual(len(seq), 2)
        # seq is deliberately not re-fetched inside the loop: the test
        # reads the same object again after every refresh
        samples = []
        for _ in range(1000):
            r.refresh()
            samples.append(str(seq))
        fd = Freqs("".join(samples))

        self.assertSimilarFreqs([fd["C"], fd["U"]], [1800, 200])
        self.assertEqual(fd["U"] + fd["C"], 2000)

        r.Composition = Freqs({"A": 5, "U": 5})
        r.Template = "NNN"  # note that changing the Template changes seq ref
        seq = r.Current
        self.assertEqual(len(seq), 3)
        for pos in range(3):
            assert seq[pos] in "AU"
        samples = []
        for _ in range(1000):
            r.refresh()
            samples.append(str(seq))
        fd = Freqs("".join(samples))
        self.assertSimilarFreqs([fd["A"], fd["U"]], [1500, 1500])
        self.assertEqual(fd["A"] + fd["U"], 3000)
Exemple #14
0
 def calcFrequencies(self, delete_bad_suffixes=True):
     """For order k, gets the (k-1)-word frequencies plus what follows.

     For self.Order < 1 the work is delegated entirely to
     _first_order_frequency_calculation. Otherwise every line of self.Text
     is scanned: each k-character word is mapped to a Freqs counting the
     character that follows it. Windows advance by 1 when self.Overlapping
     is true and by k+1 otherwise.

     Sets self.Frequencies (normalized) and self.RawCounts (deep copy taken
     before normalization), plus self.Entropy when self._calc_entropy is
     set. If delete_bad_suffixes is true, self.deleteBadSuffixes() runs
     before the raw counts are copied.
     """
     use_overlap = self.Overlapping
     #rewind the text if possible -- but it might just be a string, so
     #don't complain if neither reset() nor seek() exists.
     try:
         self.Text.reset()
     except AttributeError:
         try:
             self.Text.seek(0)
         except AttributeError:
             pass
     order = self.Order
     if order < 1:
         #must be 0 or '-1': just need to count single bases
         self._first_order_frequency_calculation()
         return
     #need to figure out what comes after the first k bases
     followers = {}
     for line in self.Text:
         if not self.Linebreaks:
             line = line.strip()
         if not line:
             #nothing to count on a blank line
             continue
         stop = len(line) - order
         step = 1 if use_overlap else order + 1
         for start in range(0, stop, step):
             word = line[start:start+order]
             following = line[start+order]
             seen = followers.get(word, None)
             if seen is None:
                 followers[word] = Freqs({following:1})
             else:
                 seen += following
     if self._calc_entropy:
         self.Entropy = self._entropy(followers)
     self.Frequencies = followers
     if delete_bad_suffixes:
         self.deleteBadSuffixes()
     #preserve non-normalized freqs
     self.RawCounts = deepcopy(followers)
     for dist in list(self.Frequencies.values()):
         dist.normalize()
Exemple #15
0
    def test_init(self):
        """Unpaired region should generate right freqs, even after change"""
        composition = Freqs({'C':10,'U':1, 'A':0})
        r = UnpairedRegion('NN', composition)
        seq = r.Current
        for symbol_index in (0, 1):
            assert seq[symbol_index] in 'CU'
        self.assertEqual(len(seq), 2)
        #seq is read again after every refresh without re-fetching r.Current
        collected = []
        for _ in range(1000):
            r.refresh()
            collected.append(str(seq))
        fd = Freqs(''.join(collected))

        observed = [fd['C'], fd['U']]
        self.assertSimilarFreqs(observed, [1800, 200])
        self.assertEqual(fd['U'] + fd['C'], 2000)

        r.Composition = Freqs({'A':5, 'U':5})
        r.Template = 'NNN'  #note that changing the Template changes seq ref
        seq = r.Current
        self.assertEqual(len(seq), 3)
        for symbol_index in (0, 1, 2):
            assert seq[symbol_index] in 'AU'
        collected = []
        for _ in range(1000):
            r.refresh()
            collected.append(str(seq))
        fd = Freqs(''.join(collected))
        observed = [fd['A'], fd['U']]
        self.assertSimilarFreqs(observed, [1500, 1500])
        self.assertEqual(fd['A'] + fd['U'], 3000)
Exemple #16
0
    def _first_order_frequency_calculation(self):
        """Handles single-character calculations, which are independent.

        No context is needed, so every line of self.Text is added to one
        Freqs. When self.Linebreaks is false, the counts for '\\r' and '\\n'
        are removed. When self.Order is negative, every remaining count is
        set to 1 so all symbols end up equally frequent.

        Sets self.RawCounts to {'': copy of the counts before normalization}
        and self.Frequencies to {'': the normalized counts}.
        """
        counts = Freqs('')
        for line in self.Text:
            counts += line
        if not self.Linebreaks:
            #get rid of line breaks; don't care if there weren't any
            for newline_char in ('\r', '\n'):
                try:
                    del counts[newline_char]
                except KeyError:
                    continue
        if self.Order < 0:
            #negative order: equalize the frequencies
            for symbol in counts:
                counts[symbol] = 1
        self.RawCounts = {'': deepcopy(counts)}
        counts.normalize()
        self.Frequencies = {'': counts}
Exemple #17
0
 def test_init(self):
     """ConstantRegion should always return current template."""
     template = 'ACGUUCGA'
     #a blank region model is empty
     r = ConstantRegion()
     self.assertEqual(str(r.Current), '')
     self.assertEqual(len(r), 0)
     #once a template is assigned, Current and len follow it
     r.Template = template
     self.assertEqual(str(r.Current), template)
     self.assertEqual(len(r), len(template))
     #refresh must not change a constant region
     r.refresh()
     self.assertEqual(str(r.Current), template)
     self.assertEqual(len(r), len(template))
     #composition starts unset and can be assigned
     self.assertEqual(r.Composition, None)
     d = {'A':3, 'U':10}
     r.Composition = Freqs(d)
     self.assertEqual(r.Composition, d)
     #assigning a composition must not affect what refresh produces
     r.refresh()
     self.assertEqual(str(r.Current), template)
     self.assertEqual(len(r), len(template))
Exemple #18
0
def MagePointFromBaseFreqs(freqs, get_label=None, get_color=None, get_radius=None):
    """Returns a MagePoint from an object with counts for the bases.

    freqs must support .get(key, default) for the keys "A", "C", "G", "U"
    and "T". The T count is used in place of U only when the U count is
    missing or zero.

    get_label should be a function that calculates a label from the freqs.
    If get_label is not supplied, checks freqs.Label, freqs.Species, freqs.Id,
    freqs.Accession, and freqs.Name in that order. If get_label fails or none
    of the attributes is found, no label is written.

    get_color should be a function that calculates a color from the freqs.
    Default is freqs.Color if present, otherwise no color (i.e. the point has
    the color for the series), which will also happen if get_color fails.

    get_radius is similar to get_color; without it, float(freqs.Radius) is
    used when that attribute exists and converts cleanly.

    The callbacks are best-effort: any ordinary Exception they raise is
    swallowed and the corresponding value becomes None. KeyboardInterrupt
    and SystemExit are deliberately NOT swallowed.

    The counts are normalized over A, C, G and U; only the normalized
    A, C and G values are passed to MagePoint as coordinates.
    """
    label = None
    if get_label:
        try:
            label = get_label(freqs)
        except Exception:
            pass  # label will be assigned None below
    else:
        for attr in ("Label", "Species", "Id", "Accession", "Name"):
            if hasattr(freqs, attr):
                label = getattr(freqs, attr)
                # keep going if the label is empty
                if label is not None and label != "":
                    break
    # collapse every empty value to None, but keep a literal 0 as a label
    if not label and label != 0:
        label = None

    if get_color:
        try:
            color = get_color(freqs)
        except Exception:
            color = None
    else:
        color = getattr(freqs, "Color", None)

    if get_radius:
        try:
            radius = get_radius(freqs)
        except Exception:
            radius = None
    else:
        if hasattr(freqs, "Radius"):
            try:
                radius = float(freqs.Radius)
            except Exception:
                # non-numeric radius: fall back to the series default
                radius = None
        else:
            radius = None

    relevant = Freqs(
        {
            "A": freqs.get("A", 0),
            "C": freqs.get("C", 0),
            "G": freqs.get("G", 0),
            "U": freqs.get("U", 0) or freqs.get("T", 0),
        }
    )
    relevant.normalize()
    return MagePoint(
        (relevant["A"], relevant["C"], relevant["G"]),
        Label=label,
        Color=color,
        Radius=radius,
    )
Exemple #19
0
 def test_init(self):
     """BaseFrequency should init as expected"""
     #default construction must equal a Freqs built with 'UCAG'
     rna_obs = BaseFrequency('UUUCCCCAG')
     rna_exp = Freqs('UUUCCCCAG', 'UCAG')
     self.assertEqual(rna_obs, rna_exp)
     #with RNA=False it must equal a plain Freqs of the same sequence
     dna_obs = BaseFrequency('TTTCAGG', RNA=False)
     dna_exp = Freqs('TTTCAGG')
     self.assertEqual(dna_obs, dna_exp)
Exemple #20
0
# NOTE(review): RnaBases, DnaBases, AminoAcids and GeneticCodes are used below
# but not defined in this span. The commented-out assignments presumably
# record the values they have where they are actually defined -- confirm.
#RnaBases = 'UCAG'
#DnaBases = 'TCAG'
# Every ordered triple of base symbols, first position varying slowest.
RnaCodons = [i + j + k for i in RnaBases for j in RnaBases for k in RnaBases]
DnaCodons = [i + j + k for i in DnaBases for j in DnaBases for k in DnaBases]
#AminoAcids = 'ACDEFGHIKLMNPQRSTVWY*'
# Entry 1 of GeneticCodes; the name suggests the standard genetic code --
# TODO confirm.
SGC = GeneticCodes[1]

# Every ordered pair of RNA base symbols.
RnaDinucs = [i + j for i in RnaBases for j in RnaBases]

# Position-wise symbol translation tables between the two base alphabets.
RnaToDna = dict(zip(RnaBases, DnaBases))
DnaToRna = dict(zip(DnaBases, RnaBases))

Bases = RnaBases  #by default
Codons = RnaCodons  #by default

# Reference distributions, normalized in place by the loop below.
# Presumably Freqs counts each symbol once, so the results are uniform --
# confirm against Freqs.
_equal_bases = Freqs(Bases)
_equal_codons = Freqs(Codons)
_equal_amino_acids = Freqs(AminoAcids[:-1])  #exclude Stop
for i in (_equal_bases, _equal_codons, _equal_amino_acids):
    i.normalize()

# Zero-filled templates with one 0.0 entry per codon.
empty_rna_codons = dict.fromkeys(RnaCodons, 0.0)
empty_dna_codons = dict.fromkeys(DnaCodons, 0.0)


def seq_to_codon_dict(seq, empty_codons=empty_dna_codons):
    """Converts sequence into codon dict."""
    leftover = len(seq) % 3
    if leftover:
        seq += 'A' * (3 - leftover)
    result = empty_codons.copy()