def test_paralinear_distance(self):
    """calculate paralinear distance consistent with hand calculation

    The expected value is rebuilt with numpy from the same two sequences
    as -1/4 * log(det(J) / sqrt(prod(row sums) * prod(column sums))),
    where J is the normalised joint frequency matrix of aligned bases.
    """
    data = [
        ('seq1',
         "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"),
        ('seq2',
         "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"),
    ]
    aln = LoadSeqs(data=data, moltype=DNA)
    paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
    paralinear_calc.run(show_progress=False)

    # J[a, b] counts alignment columns with base a in seq1 and b in seq2.
    index = dict(zip('ACGT', range(4)))
    J = numpy.zeros((4, 4))
    for p in zip(data[0][1], data[1][1]):
        J[index[p[0]], index[p[1]]] += 1
    # An empty diagonal cell gets a 0.5 pseudo-count before normalising
    # (presumably mirroring what the calculator does -- confirm there).
    for i in range(4):
        if J[i, i] == 0:
            J[i, i] += 0.5
    J /= J.sum()

    # Marginal frequencies: f[0] per seq1 base, f[1] per seq2 base.
    f = J.sum(1), J.sum(0)
    dist = -0.25 * numpy.log(
        numpy.linalg.det(J) / numpy.sqrt(f[0].prod() * f[1].prod()))

    self.assertFloatEqual(paralinear_calc.Dists[1, 1], dist, eps=1e-3)
def test_paralinear_variance(self):
    """Paralinear variance agrees with a value worked out by hand.

    The reference value is accumulated below from the inverse of the
    normalised joint frequency matrix and the marginal base frequencies.
    """
    data = [
        ('seq1',
         "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"),
        ('seq2',
         "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"),
    ]
    alignment = LoadSeqs(data=data, moltype=DNA)
    calculator = ParalinearPair(moltype=DNA, alignment=alignment)
    calculator.run(show_progress=False)

    # Tally each aligned (seq1 base, seq2 base) pair into a 4x4 table.
    position = dict(zip('ACGT', range(4)))
    joint = numpy.zeros((4, 4))
    for first, second in zip(data[0][1], data[1][1]):
        joint[position[first], position[second]] += 1

    # Empty diagonal cells receive a 0.5 pseudo-count before normalising.
    for k in range(4):
        if joint[k, k] == 0:
            joint[k, k] += 0.5
    joint /= joint.sum()

    inverse = numpy.linalg.inv(joint)
    row_freqs = joint.sum(1)
    col_freqs = joint.sum(0)

    expected = 0.
    for col in range(4):
        for row in range(4):
            expected += inverse[row, col]**2 * joint[col, row]
        expected -= 1 / numpy.sqrt(row_freqs[col] * col_freqs[col])
    expected /= 16 * len(data[0][1])

    self.assertFloatEqual(calculator.Variances[1, 1], expected, eps=1e-3)
def test_paralinear_variance(self):
    """Check the calculator's variance against an independent computation."""
    seq1 = "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"
    seq2 = "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC"
    data = [('seq1', seq1), ('seq2', seq2)]

    aln = LoadSeqs(data=data, moltype=DNA)
    calc = ParalinearPair(moltype=DNA, alignment=aln)
    calc.run(show_progress=False)

    # Joint counts of aligned base pairs, rows indexed by the seq1 base.
    base_to_idx = dict(zip('ACGT', range(4)))
    counts = numpy.zeros((4, 4))
    for pair in zip(seq1, seq2):
        r = base_to_idx[pair[0]]
        c = base_to_idx[pair[1]]
        counts[r, c] += 1

    # A zero on the diagonal is bumped by 0.5 prior to normalisation.
    for d in range(4):
        if counts[d, d] == 0:
            counts[d, d] += 0.5
    counts /= counts.sum()

    inv_counts = numpy.linalg.inv(counts)
    marginals = counts.sum(1), counts.sum(0)

    variance = 0.
    for i in range(4):
        for j in range(4):
            variance += inv_counts[j, i]**2 * counts[i, j]
        variance -= 1 / numpy.sqrt(marginals[0][i] * marginals[1][i])
    variance /= 16 * len(seq1)

    self.assertFloatEqual(calc.Variances[1, 1], variance, eps=1e-3)
def test_paralinear_pair_aa(self):
    """Paralinear distances can be requested for amino-acid alignments.

    This is a smoke test: it passes as long as no call below raises.
    """
    dna_alignment = LoadSeqs('data/brca1_5.paml', moltype=DNA)
    protein_alignment = dna_alignment.getTranslation()

    calculator = ParalinearPair(moltype=PROTEIN, alignment=protein_alignment)
    calculator.run(show_progress=False)

    # The result is not inspected; the call only has to succeed.
    calculator.getPairwiseDistances()
def get_paralinear_distances(gene, data_directory=None, third_position=False,
                             **kw):
    """Return pairwise paralinear distances for the alignment file of a gene.

    Exactly one file matching ``<gene>.fasta*`` must exist in the searched
    directory. A ``.fasta`` file is read as plain text and a ``.fasta.gz``
    file is read through ``GzipFile``.

    Args:
        gene: base name of the alignment file (without the ``.fasta`` suffix).
        data_directory: directory to search; ``None`` means the current
            directory.
        third_position: when true, the alignment is reduced to the slice of
            a 'pos3' feature covering indices 2, 5, 8, ... before distances
            are computed.
        **kw: accepted for call compatibility and not used here.

    Returns:
        A dict mapping ``frozenset(key)`` to the distance, for every entry
        returned by ``getPairwiseDistances()``.

    Raises:
        AssertionError: if zero or several files match ``<gene>.fasta*``.
        RuntimeError: if the single match ends in neither ``.fasta`` nor
            ``.fasta.gz``.
    """
    # os.path.join cannot take None, so fall back to the current directory.
    directory = os.curdir if data_directory is None else data_directory
    filenames = glob.glob(os.path.join(directory, gene + '.fasta*'))

    # Raised explicitly (not via `assert`) so the check survives `python -O`
    # while callers still see the same exception type and message.
    if len(filenames) != 1:
        raise AssertionError('Wrong number of alignment files for ' + gene)
    filename = filenames[0]

    if filename.endswith('.fasta'):
        with open(filename) as fastafile:
            fastadata = fastafile.read()
    elif filename.endswith('.fasta.gz'):
        with GzipFile(filename) as fastafile:
            fastadata = fastafile.read()
    else:
        raise RuntimeError(gene + ' file could not be read')

    sequences = LoadSeqs(data=fastadata)

    if third_position:
        # One (start, end) span per index 2, 5, 8, ... below len(sequences)
        # (presumably the alignment length -- confirm against LoadSeqs).
        indices = [(i, i + 1) for i in range(2, len(sequences), 3)]
        pos3 = sequences.addFeature('pos3', 'pos3', indices)
        sequences = pos3.getSlice()

    # Built once rather than on every predicate call.
    allowed = set(DNA)
    # Keep only the items whose characters all belong to `allowed`
    # (presumably alignment columns -- confirm against `filtered`).
    sequences = sequences.filtered(lambda x: set(''.join(x)) <= allowed)

    paralinear_calc = ParalinearPair(moltype=DNA, alignment=sequences)
    paralinear_calc.run(show_progress=False)
    dists = paralinear_calc.getPairwiseDistances()

    # frozenset keys make lookups independent of the order within a pair.
    return {frozenset(k): v for k, v in dists.items()}
def test_paralinear_pair_dna(self):
    """Paralinear and LogDet calculators agree on this DNA pair.

    Both the distance and the variance at index [1, 1] are compared.
    """
    first = 'TAATTCATTGGGACGTCGAATCCGGCAGTCCTGCCGCAAAAGCTTCCGGAATCGAATTTTGGCA'
    second = 'AAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGG'
    alignment = LoadSeqs(data=[('seq1', first), ('seq2', second)],
                         moltype=DNA)

    paralinear = ParalinearPair(moltype=DNA, alignment=alignment)
    paralinear.run(show_progress=False)

    logdet = LogDetPair(moltype=DNA, alignment=alignment)
    logdet.run(show_progress=False)

    logdet_dist = logdet.Dists[1, 1]
    paralinear_dist = paralinear.Dists[1, 1]
    self.assertFloatEqual(logdet_dist, paralinear_dist, eps=1e-3)

    paralinear_var = paralinear.Variances[1, 1]
    logdet_var = logdet.Variances[1, 1]
    self.assertFloatEqual(paralinear_var, logdet_var, eps=1e-3)
def test_paralinear_pair_dna(self):
    """Cross-check paralinear results against the LogDet calculator."""
    seq_records = [
        ('seq1',
         'TAATTCATTGGGACGTCGAATCCGGCAGTCCTGCCGCAAAAGCTTCCGGAATCGAATTTTGGCA'),
        ('seq2',
         'AAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGG'),
    ]
    aligned = LoadSeqs(data=seq_records, moltype=DNA)
    # Both calculators are constructed with the same keyword arguments.
    shared_args = dict(moltype=DNA, alignment=aligned)

    para = ParalinearPair(**shared_args)
    para.run(show_progress=False)
    ldet = LogDetPair(**shared_args)
    ldet.run(show_progress=False)

    # The distance and then the variance must match to within eps.
    self.assertFloatEqual(ldet.Dists[1, 1], para.Dists[1, 1], eps=1e-3)
    self.assertFloatEqual(para.Variances[1, 1], ldet.Variances[1, 1],
                          eps=1e-3)
def test_paralinear_distance(self):
    """calculate paralinear distance consistent with hand calculation

    The reference distance is computed here as
    -0.25 * log(det(J) / sqrt(prod(J row sums) * prod(J column sums)))
    with J the normalised joint frequency matrix of the aligned bases.
    """
    data = [('seq1',
             "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT"),
            ('seq2',
             "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC")]
    aln = LoadSeqs(data=data, moltype=DNA)
    paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)
    paralinear_calc.run(show_progress=False)

    # Count aligned base pairs: row = base in seq1, column = base in seq2.
    index = dict(zip('ACGT', range(4)))
    J = numpy.zeros((4, 4))
    for base1, base2 in zip(data[0][1], data[1][1]):
        J[index[base1], index[base2]] += 1

    # Zero diagonal entries are bumped by 0.5 before normalising
    # (presumably matching the calculator's own adjustment -- confirm).
    for i in range(4):
        if J[i, i] == 0:
            J[i, i] += 0.5
    J /= J.sum()

    # Marginal base frequencies of seq1 (rows) and seq2 (columns).
    row_freqs, col_freqs = J.sum(1), J.sum(0)
    dist = -0.25 * numpy.log(
        numpy.linalg.det(J) / numpy.sqrt(row_freqs.prod() * col_freqs.prod()))

    self.assertFloatEqual(paralinear_calc.Dists[1, 1], dist, eps=1e-3)
def test_paralinear_for_determinant_lte_zero(self):
    """returns distance of None if the determinant is <= 0

    The calculator is run twice on the same alignment and the first
    reported distance must be None after each run.
    """
    data = dict(
        seq1="AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
        seq2="TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC")
    aln = LoadSeqs(data=data, moltype=DNA)
    paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)

    paralinear_calc.run(show_progress=False)
    dists = paralinear_calc.getPairwiseDistances()
    # list(...) keeps the indexing valid when values() returns a view
    # (Python 3) rather than a list (Python 2).
    self.assertTrue(list(dists.values())[0] is None)

    # Second run on the same calculator must give the same answer.
    paralinear_calc.run(show_progress=False)
    dists = paralinear_calc.getPairwiseDistances()
    self.assertTrue(list(dists.values())[0] is None)
def test_paralinear_for_determinant_lte_zero(self):
    """returns distance of None if the determinant is <= 0

    Run, fetch the pairwise distances and check the first one is None;
    the same sequence is performed twice on one calculator instance.
    """
    data = dict(
        seq1="AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
        seq2="TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC")
    aln = LoadSeqs(data=data, moltype=DNA)
    paralinear_calc = ParalinearPair(moltype=DNA, alignment=aln)

    for _ in range(2):
        paralinear_calc.run(show_progress=False)
        dists = paralinear_calc.getPairwiseDistances()
        # values() is materialised so indexing works whether it returns
        # a list (Python 2) or a dict view (Python 3).
        first_distance = list(dists.values())[0]
        self.assertTrue(first_distance is None)