def test_init_one_parameter(self):
    """PairFrequency should interpret single parameter as pair probs"""
    obs = PairFrequency('UCCC')
    exp = Freqs({('U','U'):0.0625, ('U','C'):0.1875,
                 ('C','U'):0.1875, ('C','C'):0.5625})
    #every expected pair must be present with the expected value...
    for k, v in exp.items():
        self.assertEqual(v, obs[k])
    #...and every pair that is not expected must have a zero entry
    for k, v in obs.items():
        if k not in exp:
            self.assertEqual(v, 0)
    self.assertEqual(PairFrequency('UCCC', [('U','U'),('C','C')]),
                     Freqs({('U','U'):0.1, ('C','C'):0.9}))
    #check that the alphabets are right: should not raise error on
    #incrementing characters already there, but should raise KeyError
    #on anything that's missing.
    p = PairFrequency('UCCC')
    p[('U','U')] += 1
    try:
        p[('X','U')] += 1
    except KeyError:
        pass
    else:
        #call form of raise is valid on both Python 2 and Python 3
        raise AssertionError("Expected KeyError.")
    p = PairFrequency('UCCC', (('C','C'),))
    p[('C','C')] += 1
    try:
        p[('U','U')] += 1
    except KeyError:
        pass
    else:
        raise AssertionError("Expected KeyError.")
def test_known_vals(self):
    """Composition should reproduce the precalculated sizes and end points"""
    #number of elements for the two known parameter settings
    with_one = Composition(5, 1, "ACGU")
    self.assertEqual(len(with_one), 969)
    with_zero = Composition(5, 0, "ACGU")
    self.assertEqual(len(with_zero), 1771)
    #first and last elements of the materialised sequence
    elements = list(Composition(5, 1, "ACGU"))
    self.assertEqual(elements[0], Freqs('A' * 17 + 'CGU'))
    self.assertEqual(elements[-1], Freqs('U' * 17 + 'ACG'))
def kendalls_tau(x, y, return_p=True):
    """Returns Kendall's tau for the paired observations x and y.

    tau is computed as (con - discor) / sqrt((total + y_tied) *
    (total + x_tied)), where con/discor count concordant/discordant pairs,
    total = con + discor, and the tie counters count pairs tied in exactly
    one of the two variables.

    Arguments:
        - x, y: the paired observations; they are handed to
          as_paired_ranks and (when ties are present) to Freqs.
        - return_p: returns the probability from the normal approximation
          when True, otherwise just returns tau

    Returns (tau, p) when return_p is True, otherwise tau.

    NOTE(review): when no pair differs in x, or no pair differs in y (this
    includes fewer than two observations), the denominator is zero and the
    division fails -- presumably with ZeroDivisionError; confirm against
    the number type returned by as_paired_ranks.
    """
    ranked = as_paired_ranks(x, y)
    n = len(ranked)
    con = 0
    discor = 0
    x_tied = 0
    y_tied = 0
    #classify every unordered pair of observations
    for i in range(n - 1):
        x_1 = ranked[i][0]
        y_1 = ranked[i][1]
        for j in range(i + 1, n):
            x_diff = x_1 - ranked[j][0]
            y_diff = y_1 - ranked[j][1]
            if x_diff * y_diff > 0:
                con += 1
            elif x_diff and y_diff:
                discor += 1
            else:
                #tied in at most one variable is recorded; a pair tied in
                #both variables contributes to neither counter
                if x_diff:
                    y_tied += 1
                if y_diff:
                    x_tied += 1

    diff = con - discor
    total = con + discor
    denom = ((total + y_tied) * (total + x_tied)) ** 0.5
    #float literals keep the arithmetic exact even where '/' would
    #otherwise be integer division
    variance = (4.0 * n + 10) / (9 * n * (n - 1))
    tau = diff / denom
    stat = tau
    if x_tied or y_tied:
        #switch to the tie-corrected statistic and variance: from here on
        #x_tied/y_tied hold the sizes of the groups of repeated values
        x_tied = array([v for v in Freqs(x).values() if v > 1])
        y_tied = array([v for v in Freqs(y).values() if v > 1])
        t0 = n * (n - 1) / 2.0
        t1 = sum(x_tied * (x_tied - 1)) / 2.0
        t2 = sum(y_tied * (y_tied - 1)) / 2.0
        stat = tau * sqrt((t0 - t1) * (t0 - t2))
        v0 = n * (n - 1) * (2 * n + 5)
        vt = sum(x_tied * (x_tied - 1) * (2 * x_tied + 5))
        vu = sum(y_tied * (y_tied - 1) * (2 * y_tied + 5))
        v1 = sum(x_tied * (x_tied - 1)) * sum(y_tied * (y_tied - 1))
        v2 = sum(x_tied * (x_tied - 1) * (x_tied - 2)) * \
             sum(y_tied * (y_tied - 1) * (y_tied - 2))
        variance = (v0 - vt - vu) / 18.0 + \
                   v1 / (2.0 * n * (n - 1)) + \
                   v2 / (9.0 * n * (n - 1) * (n - 2))
    if return_p:
        return tau, zprob(stat / variance ** 0.5)
    else:
        return tau
def MagePointFromBaseFreqs(freqs, get_label=None, get_color=None,
                           get_radius=None):
    """Returns a MagePoint from an object with counts for the bases.

    get_label should be a function that calculates a label from the freqs.
    If get_label is not supplied, checks freqs.Label, freqs.Species,
    freqs.Id, freqs.Accession, and freqs.Name in that order. If get_label
    fails or none of the attributes is found, no label is written.

    get_color should be a function that calculates a color from the freqs.
    Default is freqs.Color if present, otherwise no color (i.e. the point
    has the color for the series), which will also happen if get_color
    fails.

    get_radius is similar to get_color, except that a freqs.Radius fallback
    is converted with float() and dropped if that conversion fails.

    The point's coordinates are the normalized A, C and G frequencies; the
    fourth base is taken from 'U', falling back to 'T' when 'U' is absent
    or zero.

    Failures inside the callbacks are deliberately swallowed (best effort),
    but only Exception subclasses are caught, so KeyboardInterrupt and
    SystemExit still propagate.
    """
    label = None
    if get_label:
        try:
            label = get_label(freqs)
        except Exception:
            pass    #label stays None
    else:
        for attr in ['Label', 'Species', 'Id', 'Accession', 'Name']:
            if hasattr(freqs, attr):
                label = getattr(freqs, attr)
                #keep going if the label is empty
                if label is not None and label != '':
                    break
    #collapse every empty label to None, but keep a literal 0
    if not label and label != 0:
        label = None

    if get_color:
        try:
            color = get_color(freqs)
        except Exception:
            color = None
    elif hasattr(freqs, 'Color'):
        color = freqs.Color
    else:
        color = None

    if get_radius:
        try:
            radius = get_radius(freqs)
        except Exception:
            radius = None
    elif hasattr(freqs, 'Radius'):
        try:
            radius = float(freqs.Radius)
        except Exception:
            radius = None
    else:
        radius = None

    relevant = Freqs({'A':freqs.get('A',0), 'C':freqs.get('C',0),
                      'G':freqs.get('G',0),
                      'U':freqs.get('U',0) or freqs.get('T',0)})
    relevant.normalize()
    return MagePoint((relevant['A'],relevant['C'],relevant['G']),
                     Label=label, Color=color, Radius=radius)
def random_source(a, k, random_f=random):
    """Builds a random Markov source over alphabet a with memory k.

    Every word of length k over a becomes a key of the returned dict; its
    value is a Freqs pairing each symbol of a with one of the values
    produced by random_f(len(a)).
    """
    words = [''.join(letters) for letters in cartesian_product([a] * k)]
    source = dict.fromkeys(words)
    #one random_f call per distinct word
    for word in source:
        source[word] = Freqs(dict(zip(a, random_f(len(a)))))
    return source
def toFreqs(self):
    """Summarises the histogram as a Freqs object.

    Iterating over self yields (bin, values) pairs; each becomes one entry
    keyed by str(bin) whose value is the number of items in values.
    """
    counts = Freqs()
    for bin_key, members in self:
        counts[str(bin_key)] = len(members)
    return counts
def apply_to(s):
    """Returns True if the items of s not in used_items occur more than x times.

    Relies on case_sens, used_items and x from the enclosing scope. When
    case_sens is false, a non-empty s is compared as lowercased strings.
    Returns False when no item outside used_items is present.
    """
    used_s = [str(item).lower() for item in s] if (s and not case_sens) else s
    fd = Freqs(used_s)
    other_counts = [fd[key] for key in fd if key not in used_items]
    if not other_counts:
        return False
    return reduce(add, other_counts) > x
def test_init(self):
    """Unpaired region should generate right freqs, even after change"""
    region = UnpairedRegion('NN', Freqs({'C':10,'U':1, 'A':0}))
    seq = region.Current
    for pos in range(2):
        assert seq[pos] in 'CU'
    self.assertEqual(len(seq), 2)
    #sample repeatedly; seq is re-read after each refresh without being
    #rebound
    samples = []
    for _ in range(1000):
        region.refresh()
        samples.append(str(seq))
    counts = Freqs(''.join(samples))
    self.assertSimilarFreqs([counts['C'], counts['U']], [1800, 200])
    self.assertEqual(counts['U'] + counts['C'], 2000)

    #swap in a new composition and a longer template
    region.Composition = Freqs({'A':5, 'U':5})
    region.Template = 'NNN'
    #note that changing the Template changes seq ref
    seq = region.Current
    self.assertEqual(len(seq), 3)
    for pos in range(3):
        assert seq[pos] in 'AU'
    samples = []
    for _ in range(1000):
        region.refresh()
        samples.append(str(seq))
    counts = Freqs(''.join(samples))
    self.assertSimilarFreqs([counts['A'], counts['U']], [1500, 1500])
    self.assertEqual(counts['A'] + counts['U'], 3000)
def calcFrequencies(self, delete_bad_suffixes=True):
    """Tallies, for each word of length Order, the symbol that follows it.

    For Order < 1 the work is delegated to
    _first_order_frequency_calculation. Otherwise sets self.Frequencies
    (normalized per word), self.RawCounts (a deep copy taken before
    normalization) and, when self._calc_entropy is set, self.Entropy.
    If delete_bad_suffixes is true, self.deleteBadSuffixes() is called
    before the raw counts are copied.
    """
    overlapping = self.Overlapping
    #rewind the text if it supports it -- it might just be a string, so a
    #missing reset()/seek() is not an error
    try:
        self.Text.reset()
    except AttributeError:
        try:
            self.Text.seek(0)
        except AttributeError:
            pass

    k = self.Order
    if k < 1:
        #must be 0 or '-1': just need to count single bases
        self._first_order_frequency_calculation()
        return

    #need to figure out what comes after each k-length word
    all_freqs = {}
    for line in self.Text:
        if not self.Linebreaks:
            line = line.strip()
        #skip the line if it's blank
        if not line:
            continue
        end = len(line) - k
        starts = range(end) if overlapping else range(0, end, k + 1)
        for start in starts:
            word = line[start:start + k]
            following = line[start + k]
            seen = all_freqs.get(word, None)
            if seen is not None:
                seen += following
            else:
                all_freqs[word] = Freqs({following: 1})

    if self._calc_entropy:
        self.Entropy = self._entropy(all_freqs)
    self.Frequencies = all_freqs
    if delete_bad_suffixes:
        self.deleteBadSuffixes()
    #preserve non-normalized freqs
    self.RawCounts = deepcopy(all_freqs)
    for dist in list(self.Frequencies.values()):
        dist.normalize()
def codons(self, genetic_code=SGC, codon_usage=_equal_codons):
    """Predicts a set of codon frequencies from these frequencies.

    genetic_code is indexed by each key of self to obtain its codons;
    codon_usage supplies a weight per codon. Each key's normalized
    frequency is split across its codons in proportion to the normalized
    weights. Keys for which genetic_code yields no codons are skipped.

    Returns a CodonUsage built from the result, self.info and genetic_code.
    """
    normalized = Freqs(self)
    normalized.normalize()
    result = {}
    for aa, aa_freq in list(normalized.items()):
        curr_codons = [c.upper().replace('T', 'U') for c in genetic_code[aa]]
        if curr_codons:     #code might be missing some amino acids?
            weights = Numbers([codon_usage[c] for c in curr_codons])
            weights.normalize()
            for codon, weight in zip(curr_codons, weights):
                result[codon] = weight * aa_freq
    return CodonUsage(result, self.info, genetic_code)
def _first_order_frequency_calculation(self):
    """Counts single characters over the whole text.

    No context is needed here, so every line is fed into one Freqs.
    Stores a deep copy of the counts in self.RawCounts and the normalized
    counts in self.Frequencies, both under the key ''.
    """
    counts = Freqs('')
    for line in self.Text:
        counts += line
    #get rid of line breaks if necessary
    if not self.Linebreaks:
        for newline_char in ('\r', '\n'):
            try:
                del counts[newline_char]
            except KeyError:
                pass    #don't care if there weren't any
    #if order is negative, equalize the frequencies
    if self.Order < 0:
        for symbol in counts:
            counts[symbol] = 1
    self.RawCounts = {'': deepcopy(counts)}
    counts.normalize()
    self.Frequencies = {'': counts}
def test_init(self):
    """ConstantRegion should always return current template."""
    template = 'ACGUUCGA'

    #test blank region model
    region = ConstantRegion()
    self.assertEqual(str(region.Current), '')
    self.assertEqual(len(region), 0)

    #now assign it to a template
    region.Template = template
    self.assertEqual(str(region.Current), template)
    self.assertEqual(len(region), len(template))

    #check that refresh doesn't break anything
    region.refresh()
    self.assertEqual(str(region.Current), template)
    self.assertEqual(len(region), len(template))

    #check composition
    self.assertEqual(region.Composition, None)
    base_counts = {'A':3, 'U':10}
    region.Composition = Freqs(base_counts)
    self.assertEqual(region.Composition, base_counts)

    #check that composition doesn't break the update
    region.refresh()
    self.assertEqual(str(region.Current), template)
    self.assertEqual(len(region), len(template))
#RnaBases = 'UCAG' #DnaBases = 'TCAG' RnaCodons = [i + j + k for i in RnaBases for j in RnaBases for k in RnaBases] DnaCodons = [i + j + k for i in DnaBases for j in DnaBases for k in DnaBases] #AminoAcids = 'ACDEFGHIKLMNPQRSTVWY*' SGC = GeneticCodes[1] RnaDinucs = [i + j for i in RnaBases for j in RnaBases] RnaToDna = dict(zip(RnaBases, DnaBases)) DnaToRna = dict(zip(DnaBases, RnaBases)) Bases = RnaBases #by default Codons = RnaCodons #by default _equal_bases = Freqs(Bases) _equal_codons = Freqs(Codons) _equal_amino_acids = Freqs(AminoAcids[:-1]) #exclude Stop for i in (_equal_bases, _equal_codons, _equal_amino_acids): i.normalize() empty_rna_codons = dict.fromkeys(RnaCodons, 0.0) empty_dna_codons = dict.fromkeys(DnaCodons, 0.0) def seq_to_codon_dict(seq, empty_codons=empty_dna_codons): """Converts sequence into codon dict.""" leftover = len(seq) % 3 if leftover: seq += 'A' * (3 - leftover) result = empty_codons.copy()
def test_init(self):
    """BaseFrequency should match the equivalent Freqs on construction"""
    #default (RNA) alphabet
    rna_obs = BaseFrequency('UUUCCCCAG')
    rna_exp = Freqs('UUUCCCCAG', 'UCAG')
    self.assertEqual(rna_obs, rna_exp)
    #with RNA switched off
    dna_obs = BaseFrequency('TTTCAGG', RNA=False)
    dna_exp = Freqs('TTTCAGG')
    self.assertEqual(dna_obs, dna_exp)