Esempio n. 1
0
    def test_protein(self):
        """Check pseudocount regularization on a single protein alignment.

        The expected values blend the unregularized frequencies with a flat
        background (``alpha / 21`` for single-site entries and
        ``alpha / 21**2`` for pair entries). Every same-column 20x20 block of
        the pair frequencies is then replaced by the diagonal of the
        single-site frequencies before the covariance matrix is derived.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.statistics import Statistics

        sequences = [
            'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD', 'CMPRY', 'QWFWR', 'VTMPE',
            'LNYIN', 'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR', 'RFGKF', 'CGRCG',
            'T-PMV', 'LNCPY',
        ]
        align = Alignment(sequences, protein_alphabet)

        alpha = 0.3
        raw = Statistics(align)
        regularized = Statistics(align,
                                 regularizer='pseudocount',
                                 regularization_amount=alpha)

        data_weight = 1 - alpha
        expected_f1 = data_weight * raw.freq1 + alpha / 21.0
        expected_f2 = data_weight * raw.freq2 + alpha / 21.0**2

        # Entries belonging to the same column are not independent, so the
        # blocks on the diagonal cannot use the product-style background.
        n_letters = 20
        for start in range(0, n_letters * align.get_width(), n_letters):
            block = slice(start, start + n_letters)
            # noinspection PyUnresolvedReferences
            expected_f2[block, block] = np.diag(expected_f1[block])

        expected_cmat = expected_f2 - np.outer(expected_f1, expected_f1)

        # noinspection PyTypeChecker
        f1_matches = np.allclose(expected_f1, regularized.freq1)
        self.assertTrue(f1_matches)
        # noinspection PyTypeChecker
        f2_matches = np.allclose(expected_f2, regularized.freq2)
        self.assertTrue(f2_matches)
        # noinspection PyTypeChecker
        cmat_matches = np.allclose(expected_cmat, regularized.cmat)
        self.assertTrue(cmat_matches)
Esempio n. 2
0
 def test_multi_alpha_shape_and_symmetry_of_couplings(self):
     """Check size and symmetry of the couplings for a mixed alignment.

     A DNA, a protein and an RNA alignment are concatenated and a
     ``MaxentModel`` is built from their statistics. The coupling matrix
     must equal its transpose to within 1e-10 and be square with side
     ``4 * (DNA width + RNA width) + 20 * protein width``.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
     from multicov.align_io import load_fasta
     from multicov.statistics import Statistics, MaxentModel
     from os.path import join

     dna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                           dna_alphabet,
                           invalid_letter_policy='gap')
     protein_part = load_fasta(join('test_data', 'test_aln1.fasta'),
                               protein_alphabet,
                               invalid_letter_policy='gap')
     rna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                           rna_alphabet,
                           invalid_letter_policy='uppergap')

     combined = Alignment(dna_part)
     combined.add(protein_part).add(rna_part)

     model = MaxentModel(Statistics(combined, regularization_amount=0.5))

     asymmetry = np.abs(model.couplings - model.couplings.T)
     self.assertLess(np.max(asymmetry), 1e-10)

     nucleotide_width = dna_part.get_width() + rna_part.get_width()
     side = 4 * nucleotide_width + 20 * protein_part.get_width()
     self.assertSequenceEqual(np.shape(model.couplings), [side, side])
Esempio n. 3
0
    def test_delayed_evaluation(self):
        """Check that statistics reflect edits made after construction.

        A ``Statistics`` object is created, the alignment data is then
        modified in place, and only afterwards are ``freq1``, ``freq2`` and
        ``cmat`` read. They must match values computed from the modified
        alignment by the module-level ``_slow_get_freq1`` /
        ``_slow_get_freq2`` helpers, and those values must differ from the
        ones obtained before the modification.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.statistics import Statistics

        def reference_stats():
            # Single-site frequencies, pair frequencies and covariance for
            # the alignment in its current state.
            f1 = _slow_get_freq1(align)
            f2 = _slow_get_freq2(align)
            return f1, f2, f2 - np.outer(f1, f1)

        sequences = [
            'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD', 'CMPRY', 'QWFWR', 'VTMPE',
            'LNYIN', 'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR', 'RFGKF', 'CGRCG',
            'T-PMV', 'LNCPY',
        ]
        align = Alignment(sequences, protein_alphabet)

        stats = Statistics(align)
        before = reference_stats()

        # Edit the alignment; the statistics must be computed for this
        # modified version, not for the one seen at construction time.
        align.data[1, :] = list('CMPRY')
        align.data[10, :] = list('KHRCD')

        after = reference_stats()

        # The edit has to change every reference quantity, otherwise the
        # checks below would not distinguish delayed from eager evaluation.
        for new_value, old_value in zip(after, before):
            self.assertFalse(np.allclose(new_value, old_value))

        expected_f1, expected_f2, expected_cmat = after
        self.assertTrue(np.allclose(stats.freq1, expected_f1))
        self.assertTrue(np.allclose(stats.freq2, expected_f2))
        self.assertTrue(np.allclose(stats.cmat, expected_cmat))
Esempio n. 4
0
 def test_multi_alpha_diagonalness_of_blockdiagonal_blocks(self):
     """Check that each block-diagonal block of the couplings is diagonal.

     For every index range yielded by ``binary_index_map``, the matching
     square block of the coupling matrix may have non-zero entries on its
     diagonal only (off-diagonal magnitude below 1e-10).
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
     from multicov.align_io import load_fasta
     from multicov.binary import binary_index_map
     from multicov.statistics import Statistics, MaxentModel
     from os.path import join

     protein_part = load_fasta(join('test_data', 'test_aln1.fasta'),
                               protein_alphabet,
                               invalid_letter_policy='gap')
     dna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                           dna_alphabet,
                           invalid_letter_policy='gap')
     rna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                           rna_alphabet,
                           invalid_letter_policy='gap')

     combined = Alignment(protein_part)
     combined.add(dna_part).add(rna_part)

     stats = Statistics(combined, regularization_amount=0.5)
     model = MaxentModel(stats)

     for bounds in binary_index_map(stats):
         lo, hi = bounds[0], bounds[1]
         block = model.couplings[lo:hi, lo:hi]
         off_diagonal = block - np.diag(np.diag(block))
         self.assertLess(np.max(np.abs(off_diagonal)), 1e-10)
Esempio n. 5
0
    def test_multi_alpha(self):
        """Check pseudocount regularization on a DNA + protein alignment.

        The expected statistics are built by mixing the unregularized
        frequencies with a per-alphabet uniform background
        (``1 / alphabet.size()`` for each non-gap letter of each column),
        overwriting each single-column block of the pair frequencies with the
        diagonal of the single-site frequencies, and deriving the covariance
        matrix from the two.
        """
        from multicov.alphabet import protein_alphabet, dna_alphabet
        from multicov.align_io import load_fasta
        from multicov.statistics import Statistics
        from os.path import join
        align = load_fasta(join('test_data', 'test_aln2.fasta'),
                           dna_alphabet,
                           invalid_letter_policy='gap')
        align2 = load_fasta(join('test_data', 'test_aln1.fasta'),
                            protein_alphabet,
                            invalid_letter_policy='gap')
        align.add(align2)

        alpha = 0.6

        stats1 = Statistics(align)
        stats2 = Statistics(align,
                            regularization_amount=alpha,
                            regularizer='pseudocount')

        # np.hstack requires a real sequence: passing a generator was
        # deprecated in NumPy 1.16 and raises TypeError in current releases,
        # hence the list comprehensions here and below.
        bkg_freq1 = np.hstack([
            np.ones(width * alphabet.size(no_gap=True)) / alphabet.size()
            for alphabet, width in align.alphabets
        ])
        bkg_freq2 = np.outer(bkg_freq1, bkg_freq1)

        freq1 = (1 - alpha) * stats1.freq1 + alpha * bkg_freq1
        freq2 = (1 - alpha) * stats1.freq2 + alpha * bkg_freq2

        # Number of non-gap letters for every column, in column order.
        n_letts = np.hstack([
            width * [alphabet.size(no_gap=True)]
            for alphabet, width in align.alphabets
        ])
        # Start offset of every column's block in the binary representation.
        idxs0 = np.hstack(([0], np.cumsum(n_letts)[:-1]))
        for idx0, n_lett in zip(idxs0, n_letts):
            idxs = slice(idx0, idx0 + n_lett)
            # Entries within one column are not independent, so the
            # same-column block is the diagonal of the single-site values.
            # noinspection PyUnresolvedReferences
            freq2[idxs, idxs] = np.diag(freq1[idxs])

        cmat = freq2 - np.outer(freq1, freq1)

        self.assertTrue(np.allclose(freq1, stats2.freq1))
        # noinspection PyTypeChecker
        self.assertTrue(np.allclose(freq2, stats2.freq2))
        # noinspection PyTypeChecker
        self.assertTrue(np.allclose(cmat, stats2.cmat))
Esempio n. 6
0
 def test_get_str_goes_to_annotations(self):
     """Check that string indexing on statistics mirrors the alignment.

     ``stats['seqw']`` must be the very same object as ``align['seqw']``.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics

     sequences = [
         'WKHNAY', 'KHRCDA', 'LGVVGY', 'LIGDDH', 'CMPRYW', 'QWFWRA',
         'VTMPEG', 'LNYINM', 'WHV-EW', 'PIWGGF', 'PPCWVE', 'E-MWRG',
         'RFGKFT', 'CGRCGS', 'T-PMVW', 'LNCPYA',
     ]
     align = Alignment(sequences, protein_alphabet)
     stats = Statistics(align)

     from_stats = stats['seqw']
     from_align = align['seqw']
     self.assertIs(from_stats, from_align)
Esempio n. 7
0
 def test_against_lucy_noseqw(self):
     """Compare the covariance matrix with a stored reference.

     The alignment and the reference matrix (``DCAmat_noseqw``) are both
     read from ``test_data/lucy_dca_pdz_small.mat``; statistics are computed
     with ``regularization_amount=0.5``.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics
     from scipy.io import loadmat
     import os.path

     mat_path = os.path.join('test_data', 'lucy_dca_pdz_small.mat')
     lucy = loadmat(mat_path, squeeze_me=True)

     # ``[()]`` unwraps the 0-d object array left by ``squeeze_me=True``.
     raw_data = lucy['alignment']['data'][()]
     align = Alignment(raw_data, protein_alphabet)
     stats = Statistics(align, regularization_amount=0.5)

     computed = stats.cmat
     reference = lucy['DCAmat_noseqw']
     self.assertTrue(np.allclose(computed, reference))
Esempio n. 8
0
 def test_precompute(self):
     """Check that ``precompute`` fills the requested cached attributes.

     After constructing ``Statistics`` with
     ``precompute=('freq1', 'freq2')``, both ``_freq1`` and ``_freq2`` must
     be non-``None`` without having been accessed through the public names.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics

     sequences = [
         'WKHNAY', 'KHRCDA', 'LGVVGY', 'LIGDDH', 'CMPRYW', 'QWFWRA',
         'VTMPEG', 'LNYINM', 'WHV-EW', 'PIWGGF', 'PPCWVE', 'E-MWRG',
         'RFGKFT', 'CGRCGS', 'T-PMVW', 'LNCPYA',
     ]
     align = Alignment(sequences, protein_alphabet)
     stats = Statistics(align, precompute=('freq1', 'freq2'))

     for cache_name in ('_freq1', '_freq2'):
         self.assertIsNotNone(getattr(stats, cache_name))
Esempio n. 9
0
 def test_freq2_on_protein(self):
     """Compare ``freq2`` on a protein alignment with ``_slow_get_freq2``.

     ``_slow_get_freq2`` is a module-level helper defined outside this
     method; its result is taken as the expected value.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics

     sequences = [
         'WKHNAY', 'KHRCDA', 'LGVVGY', 'LIGDDH', 'CMPRYW', 'QWFWRA',
         'VTMPEG', 'LNYINM', 'WHV-EW', 'PIWGGF', 'PPCWVE', 'E-MWRG',
         'RFGKFT', 'CGRCGS', 'T-PMVW', 'LNCPYA',
     ]
     align = Alignment(sequences, protein_alphabet)
     stats = Statistics(align)

     reference = _slow_get_freq2(align)
     agrees = np.allclose(stats.freq2, reference)
     self.assertTrue(agrees)
Esempio n. 10
0
 def test_freq1_on_protein(self):
     """Compare ``freq1`` on a protein alignment with ``_slow_get_freq1``.

     ``_slow_get_freq1`` is a module-level helper defined outside this
     method; its result is taken as the expected value.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics

     sequences = [
         'WKHNAYDML', 'KHRCDANC-', 'LGVVGYYFK', 'LIGDDHRN-', 'CMPRYWYTY',
         'QWFWRARPF', 'VTMPEGHHC', 'LNYINMHVD', 'WHV-EWKPV', 'PIWGGFNFP',
         'PPCWVEAPY', 'E-MWRGLIW', 'RFGKFTCMG', 'CGRCGSH-E', 'T-PMVWRLV',
         'LNCPYADLD',
     ]
     align = Alignment(sequences, protein_alphabet)
     stats = Statistics(align)

     reference = _slow_get_freq1(align)
     agrees = np.allclose(stats.freq1, reference)
     self.assertTrue(agrees)
Esempio n. 11
0
 def test_against_matlab_example(self):
     """Compare model energies with values stored in a MATLAB file.

     The alignment and the reference ``energies`` are both read from
     ``test_data/maxent_sample.mat``; the model is built from statistics
     computed with ``regularization_amount=0.5`` and scores that same
     alignment.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics, MaxentModel
     from scipy.io import loadmat
     import os.path

     mat_path = os.path.join('test_data', 'maxent_sample.mat')
     matlab = loadmat(mat_path, squeeze_me=True)

     # ``[()]`` unwraps the 0-d object array left by ``squeeze_me=True``.
     raw_data = matlab['alignment']['data'][()]
     align = Alignment(raw_data, protein_alphabet)

     model = MaxentModel(Statistics(align, regularization_amount=0.5))
     energies = model.score(align)

     reference = matlab['energies'][()]
     self.assertTrue(np.allclose(energies, reference))
Esempio n. 12
0
 def test_cmat_on_protein(self):
     """Compare ``cmat`` on a protein alignment with a reference value.

     The reference is ``f2 - outer(f1, f1)``, with ``f1`` and ``f2`` taken
     from the module-level ``_slow_get_freq1`` / ``_slow_get_freq2``
     helpers defined outside this method.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics

     sequences = [
         'WKHNAYD', 'KHRCDAN', 'LGVVGYY', 'LIGDDHR', 'CMPRYWY', 'QWFWRAR',
         'VTMPEGH', 'LNYINMH', 'WHV-EWK', 'PIWGGFN', 'PPCWVEA', 'E-MWRGL',
         'RFGKFTC', 'CGRCGSH', 'T-PMVWR', 'LNCPYAD',
     ]
     align = Alignment(sequences, protein_alphabet)
     stats = Statistics(align)

     ref_f1 = _slow_get_freq1(align)
     ref_f2 = _slow_get_freq2(align)
     ref_cmat = ref_f2 - np.outer(ref_f1, ref_f1)

     agrees = np.allclose(stats.cmat, ref_cmat)
     self.assertTrue(agrees)
Esempio n. 13
0
    def test_delayed_evaluation(self):
        """Check that nothing is computed when ``Statistics`` is constructed.

        Right after construction the cached attributes ``_freq1``, ``_freq2``
        and ``_cmat`` must all still be ``None``.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.statistics import Statistics

        sequences = [
            'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD', 'CMPRY', 'QWFWR', 'VTMPE',
            'LNYIN', 'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR', 'RFGKF', 'CGRCG',
            'T-PMV', 'LNCPY',
        ]
        align = Alignment(sequences, protein_alphabet)
        stats = Statistics(align)

        for cache_name in ('_freq1', '_freq2', '_cmat'):
            self.assertIsNone(getattr(stats, cache_name))
Esempio n. 14
0
    def test_copy_alpha_annots_refmap_by_ref(self):
        """Check that statistics share metadata objects with the alignment.

        ``alphabets``, ``reference`` and ``annotations`` on the
        ``Statistics`` object must be the identical objects held by the
        alignment it was built from, not copies.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.statistics import Statistics

        sequences = [
            'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD', 'CMPRY', 'QWFWR', 'VTMPE',
            'LNYIN', 'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR', 'RFGKF', 'CGRCG',
            'T-PMV', 'LNCPY',
        ]
        align = Alignment(sequences, protein_alphabet)
        stats = Statistics(align)

        for shared_name in ('alphabets', 'reference', 'annotations'):
            self.assertIs(getattr(stats, shared_name),
                          getattr(align, shared_name))
Esempio n. 15
0
 def test_with_matrix(self):
     """Check that scoring raw alignment data matches scoring the alignment.

     ``MaxentModel.score`` is called once with ``align.data`` and once with
     the ``Alignment`` object itself; both calls must return the same
     energies (within ``np.allclose`` tolerance).
     """
     # The unused function-scope import of ``binary_index_map`` was dropped.
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics, MaxentModel
     align = Alignment([
         'WKHNAY', 'KHRCDA', 'LGVVGY', 'LIGDDH', 'CMPRYW', 'QWFWRA',
         'VTMPEG', 'LNYINM', 'WHV-EW', 'PIWGGF', 'PPCWVE', 'E-MWRG',
         'RFGKFT', 'CGRCGS', 'T-PMVW', 'LNCPYA'
     ], protein_alphabet)
     stats = Statistics(align, regularization_amount=0.1)
     maxent = MaxentModel(stats)
     energies = maxent.score(align.data)
     energies0 = maxent.score(align)
     self.assertTrue(np.allclose(energies, energies0))
Esempio n. 16
0
 def test_freq2_on_multi_alpha(self):
     """Compare ``freq2`` on a DNA + protein alignment with a reference.

     The expected value comes from the module-level ``_slow_get_freq2``
     helper defined outside this method.
     """
     from multicov.alphabet import protein_alphabet, dna_alphabet
     from multicov.align_io import load_fasta
     from multicov.statistics import Statistics
     from os.path import join

     dna_path = join('test_data', 'test_aln2.fasta')
     combined = load_fasta(dna_path,
                           dna_alphabet,
                           invalid_letter_policy='gap')
     protein_path = join('test_data', 'test_aln1.fasta')
     protein_part = load_fasta(protein_path,
                               protein_alphabet,
                               invalid_letter_policy='gap')
     combined.add(protein_part)

     stats = Statistics(combined)
     reference = _slow_get_freq2(combined)
     agrees = np.allclose(stats.freq2, reference)
     self.assertTrue(agrees)
Esempio n. 17
0
    def test_on_empty(self):
        """Check a ``MaxentModel`` built from an empty alignment.

        The model must expose ``stats``, ``alphabets``, ``annotations`` and
        ``reference``; ``stats`` must be the object passed in, the other
        three must be the identical objects held by it, and the coupling
        matrix must have no elements.
        """
        from multicov.alignment import Alignment
        from multicov.statistics import Statistics, MaxentModel

        stats = Statistics(Alignment())
        model = MaxentModel(stats)

        for attr in ('stats', 'alphabets', 'annotations', 'reference'):
            self.assertTrue(hasattr(model, attr))

        self.assertIs(model.stats, stats)
        for shared_name in ('alphabets', 'annotations', 'reference'):
            self.assertIs(getattr(model, shared_name),
                          getattr(stats, shared_name))

        n_couplings = np.size(model.couplings)
        self.assertEqual(n_couplings, 0)
Esempio n. 18
0
 def test_on_empty_binalign(self):
     """Check ``Statistics`` built from an empty ``BinaryAlignment``.

     All of ``freq1``, ``freq2``, ``cmat``, ``alphabets``, ``reference``
     and ``annotations`` must exist and have length zero.
     """
     from multicov.binary import BinaryAlignment
     from multicov.statistics import Statistics

     stats = Statistics(BinaryAlignment())
     attr_names = ('freq1', 'freq2', 'cmat', 'alphabets', 'reference',
                   'annotations')

     # First make sure every attribute is present...
     for attr in attr_names:
         self.assertTrue(hasattr(stats, attr))
     # ...then that each one is empty.
     for attr in attr_names:
         self.assertEqual(len(getattr(stats, attr)), 0)
Esempio n. 19
0
 def test_against_matlab_example(self):
     """Compare covariance and couplings with values from a MATLAB file.

     The alignment and both references are read from
     ``test_data/maxent_sample.mat``: ``dca.cmat`` for the covariance
     matrix and ``params_nogap_nofc_nodiagtrick.couplings`` for the
     couplings. Statistics use ``regularization_amount=0.5``.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics, MaxentModel
     from scipy.io import loadmat
     import os.path

     mat_path = os.path.join('test_data', 'maxent_sample.mat')
     matlab = loadmat(mat_path, squeeze_me=True)

     # ``[()]`` unwraps the 0-d object arrays left by ``squeeze_me=True``.
     raw_data = matlab['alignment']['data'][()]
     align = Alignment(raw_data, protein_alphabet)
     stats = Statistics(align, regularization_amount=0.5)
     model = MaxentModel(stats)

     cmat_agrees = np.allclose(stats.cmat, matlab['dca']['cmat'][()])
     self.assertTrue(cmat_agrees)

     couplings_agree = np.allclose(
         model.couplings,
         matlab['params_nogap_nofc_nodiagtrick']['couplings'][()])
     self.assertTrue(couplings_agree)
Esempio n. 20
0
    def test_on_protein_binalign(self):
        """Check statistics computed from the binary form of an alignment.

        ``Statistics`` is given ``align.to_binary()``; its ``freq1``,
        ``freq2`` and ``cmat`` must match reference values obtained from the
        original alignment with the module-level ``_slow_get_freq1`` /
        ``_slow_get_freq2`` helpers defined outside this method.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.statistics import Statistics

        sequences = [
            'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD', 'CMPRY', 'QWFWR', 'VTMPE',
            'LNYIN', 'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR', 'RFGKF', 'CGRCG',
            'T-PMV', 'LNCPY',
        ]
        align = Alignment(sequences, protein_alphabet)

        binary_form = align.to_binary()
        stats = Statistics(binary_form)

        ref_f1 = _slow_get_freq1(align)
        ref_f2 = _slow_get_freq2(align)
        ref_cmat = ref_f2 - np.outer(ref_f1, ref_f1)

        self.assertTrue(np.allclose(stats.freq1, ref_f1))
        self.assertTrue(np.allclose(stats.freq2, ref_f2))
        self.assertTrue(np.allclose(stats.cmat, ref_cmat))
Esempio n. 21
0
 def test_on_protein(self):
     """Compare off-diagonal couplings with the negated inverse covariance.

     The expected matrix is ``-inv(cmat)`` with every square block given by
     ``binary_index_map`` set to zero. It is compared against the couplings
     with their main diagonal removed.
     """
     from multicov.alignment import Alignment
     from multicov.binary import binary_index_map
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics, MaxentModel

     sequences = [
         'WKHNAY', 'KHRCDA', 'LGVVGY', 'LIGDDH', 'CMPRYW', 'QWFWRA',
         'VTMPEG', 'LNYINM', 'WHV-EW', 'PIWGGF', 'PPCWVE', 'E-MWRG',
         'RFGKFT', 'CGRCGS', 'T-PMVW', 'LNCPYA',
     ]
     align = Alignment(sequences, protein_alphabet)
     stats = Statistics(align, regularization_amount=0.1)
     model = MaxentModel(stats)

     expected = -np.linalg.inv(stats.cmat)
     for bounds in binary_index_map(stats):
         lo, hi = bounds[0], bounds[1]
         expected[lo:hi, lo:hi] = 0

     without_diagonal = model.couplings - np.diag(np.diag(model.couplings))
     self.assertTrue(np.allclose(expected, without_diagonal))
Esempio n. 22
0
 def test_cmat_on_multi_alpha(self):
     """Compare ``cmat`` on a DNA + protein + RNA alignment with a reference.

     The reference is ``f2 - outer(f1, f1)``, with ``f1`` and ``f2`` taken
     from the module-level ``_slow_get_freq1`` / ``_slow_get_freq2``
     helpers defined outside this method.
     """
     from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
     from multicov.align_io import load_fasta
     from multicov.statistics import Statistics
     from os.path import join

     combined = load_fasta(join('test_data', 'test_aln2.fasta'),
                           dna_alphabet,
                           invalid_letter_policy='gap')
     protein_part = load_fasta(join('test_data', 'test_aln1.fasta'),
                               protein_alphabet,
                               invalid_letter_policy='gap')
     rna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                           rna_alphabet,
                           invalid_letter_policy='uppergap')
     combined.add(protein_part).add(rna_part)

     stats = Statistics(combined)

     ref_f1 = _slow_get_freq1(combined)
     ref_f2 = _slow_get_freq2(combined)
     ref_cmat = ref_f2 - np.outer(ref_f1, ref_f1)

     agrees = np.allclose(stats.cmat, ref_cmat)
     self.assertTrue(agrees)
Esempio n. 23
0
 def test_gap_gauge(self):
     """Check that an all-gap sequence is scored with zero energy.

     A model is built from a DNA + protein + RNA alignment and asked to
     score a single sequence consisting only of ``'-'`` characters, one per
     alignment column. The largest absolute energy must be below 1e-10.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
     from multicov.align_io import load_fasta
     from multicov.statistics import Statistics, MaxentModel
     from os.path import join

     dna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                           dna_alphabet,
                           invalid_letter_policy='gap')
     protein_part = load_fasta(join('test_data', 'test_aln1.fasta'),
                               protein_alphabet,
                               invalid_letter_policy='gap')
     rna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                           rna_alphabet,
                           invalid_letter_policy='uppergap')

     combined = Alignment(dna_part)
     combined.add(protein_part).add(rna_part)

     model = MaxentModel(Statistics(combined, regularization_amount=0.5))

     all_gaps = combined.get_width() * '-'
     energies = model.score([all_gaps])
     largest = np.max(np.abs(energies))
     self.assertLess(largest, 1e-10)
     self.assertLess(np.max(np.abs(energies)), 1e-10)