def test_multi_alpha_diagonalness_of_blockdiagonal_blocks(self):
    """Check that the diagonal blocks of the couplings are diagonal matrices.

    A protein, a DNA and an RNA alignment are combined, a `MaxentModel` is
    built from their statistics (regularization amount 0.5), and for every
    index range yielded by `binary_index_map` the matching square block of
    `maxent.couplings` must have all off-diagonal entries below 1e-10 in
    absolute value.
    """
    from os.path import join

    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
    from multicov.align_io import load_fasta
    from multicov.binary import binary_index_map
    from multicov.statistics import Statistics, MaxentModel

    protein_part = load_fasta(join('test_data', 'test_aln1.fasta'),
                              protein_alphabet, invalid_letter_policy='gap')
    dna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                          dna_alphabet, invalid_letter_policy='gap')
    rna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                          rna_alphabet, invalid_letter_policy='gap')

    combined = Alignment(protein_part)
    combined.add(dna_part).add(rna_part)

    statistics = Statistics(combined, regularization_amount=0.5)
    model = MaxentModel(statistics)

    for index_range in binary_index_map(statistics):
        block_idx = slice(index_range[0], index_range[1])
        block = model.couplings[block_idx, block_idx]
        # Subtracting the diagonal part leaves only the off-diagonal entries.
        off_diagonal = block - np.diag(np.diag(block))
        self.assertLess(np.max(np.abs(off_diagonal)), 1e-10)
def test_multi_alpha_shape_and_symmetry_of_couplings(self):
    """Check that the couplings matrix is symmetric and has the expected size.

    The model is built on a DNA + protein + RNA alignment. The couplings
    must equal their transpose to within 1e-10, and each side must have
    length ``4 * (DNA width + RNA width) + 20 * protein width``.
    """
    from os.path import join

    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
    from multicov.align_io import load_fasta
    from multicov.statistics import Statistics, MaxentModel

    dna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                          dna_alphabet, invalid_letter_policy='gap')
    protein_part = load_fasta(join('test_data', 'test_aln1.fasta'),
                              protein_alphabet, invalid_letter_policy='gap')
    rna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                          rna_alphabet, invalid_letter_policy='uppergap')

    combined = Alignment(dna_part)
    combined.add(protein_part).add(rna_part)

    model = MaxentModel(Statistics(combined, regularization_amount=0.5))

    # Symmetry: the matrix and its transpose agree entry by entry.
    asymmetry = model.couplings - model.couplings.T
    self.assertLess(np.max(np.abs(asymmetry)), 1e-10)

    # Shape: 4 entries per DNA/RNA unit of width, 20 per protein unit.
    nucleotide_width = dna_part.get_width() + rna_part.get_width()
    expected_side = 4 * nucleotide_width + 20 * protein_part.get_width()
    self.assertSequenceEqual(np.shape(model.couplings),
                             [expected_side, expected_side])
def test_freq2_on_multi_alpha(self):
    """Compare `Statistics.freq2` with `_slow_get_freq2` on a DNA + protein alignment."""
    from os.path import join

    from multicov.alphabet import protein_alphabet, dna_alphabet
    from multicov.align_io import load_fasta
    from multicov.statistics import Statistics

    combined = load_fasta(join('test_data', 'test_aln2.fasta'),
                          dna_alphabet, invalid_letter_policy='gap')
    protein_part = load_fasta(join('test_data', 'test_aln1.fasta'),
                              protein_alphabet, invalid_letter_policy='gap')
    combined.add(protein_part)

    computed = Statistics(combined)
    reference_freq2 = _slow_get_freq2(combined)

    self.assertTrue(np.allclose(computed.freq2, reference_freq2))
def test_mask_upper(self):
    """Check `load_fasta` with ``mask_fct='upper'`` against a hand-built mask.

    The expected alignment is obtained by loading the same file with the
    'unchanged' policy and keeping only the columns whose letter in the
    first row is neither lowercase nor '.', then giving it a reference
    mapping that numbers the remaining columns from 0.
    """
    from multicov.alignment import ReferenceMapping
    from multicov.align_io import load_fasta
    from multicov.alphabet import protein_alphabet

    fasta_path = os.path.join('test_data', 'test_aln3.fasta')

    masked = load_fasta(fasta_path, protein_alphabet,
                        invalid_letter_policy='upper', mask_fct='upper')
    unmasked = load_fasta(fasta_path, protein_alphabet,
                          invalid_letter_policy='unchanged')

    # Keep a column when the first row's letter is not lowercase and not '.'.
    keep_column = [not letter.islower() and letter != '.'
                   for letter in unmasked[0, :]]

    expected = unmasked.truncate_columns(keep_column)
    n_columns = expected.data.shape[1]
    expected.reference = ReferenceMapping(list(range(n_columns)))

    self.assertEqual(masked, expected)
def test_replace_invalid_by_uppercase_then_leave(self):
    """Check `load_fasta` on the DNA fixture with ``invalid_letter_policy='upper'``.

    The loaded alignment must equal one built from the literal sequences
    below (third sequence 'G.C-A-C') with the matching 'name' annotations.
    """
    from multicov.alignment import Alignment
    from multicov.align_io import load_fasta
    from multicov.alphabet import dna_alphabet

    fasta_path = os.path.join('test_data', 'test_aln2.fasta')
    loaded = load_fasta(fasta_path, dna_alphabet,
                        invalid_letter_policy='upper')

    sequences = ['GATTACA', 'ACCA--T', 'G.C-A-C']
    reference = Alignment(sequences, dna_alphabet)
    reference.annotations['name'] = ['one', 'sequence', 'one line']

    self.assertEqual(loaded, reference)
def test_dna_unchanged_invalid(self):
    """Check `load_fasta` on the DNA fixture with ``invalid_letter_policy='unchanged'``.

    The loaded alignment must equal one built from the literal sequences
    below (third sequence 'G.c-a-c', lowercase kept) with the matching
    'name' annotations.
    """
    from multicov.alignment import Alignment
    from multicov.align_io import load_fasta
    from multicov.alphabet import dna_alphabet

    fasta_path = os.path.join('test_data', 'test_aln2.fasta')
    loaded = load_fasta(fasta_path, dna_alphabet,
                        invalid_letter_policy='unchanged')

    sequences = ['GATTACA', 'ACCA--T', 'G.c-a-c']
    reference = Alignment(sequences, dna_alphabet)
    reference.annotations['name'] = ['one', 'sequence', 'one line']

    self.assertEqual(loaded, reference)
def test_multi_alpha(self):
    """Check pseudocount-regularized statistics on a DNA + protein alignment.

    The expected values are rebuilt by hand from the unregularized
    statistics: `freq1` and `freq2` are mixed with a uniform background
    using weight `alpha`, the same-position diagonal blocks of `freq2` are
    overwritten with ``diag(freq1)``, and `cmat` is
    ``freq2 - outer(freq1, freq1)``. These must match what `Statistics`
    returns with ``regularizer='pseudocount'``.
    """
    from multicov.alphabet import protein_alphabet, dna_alphabet
    from multicov.align_io import load_fasta
    from multicov.statistics import Statistics
    from os.path import join

    align = load_fasta(join('test_data', 'test_aln2.fasta'), dna_alphabet,
                       invalid_letter_policy='gap')
    align2 = load_fasta(join('test_data', 'test_aln1.fasta'),
                        protein_alphabet, invalid_letter_policy='gap')
    align.add(align2)

    alpha = 0.6
    stats1 = Statistics(align)
    stats2 = Statistics(align, regularization_amount=alpha,
                        regularizer='pseudocount')

    # np.hstack needs a real sequence: passing a generator was deprecated in
    # NumPy 1.16 and is rejected with a TypeError by recent releases, so the
    # pieces are collected in a list first.
    bkg_freq1 = np.hstack([
        np.ones(width * alphabet.size(no_gap=True)) / alphabet.size()
        for alphabet, width in align.alphabets
    ])
    bkg_freq2 = np.outer(bkg_freq1, bkg_freq1)

    freq1 = (1 - alpha) * stats1.freq1 + alpha * bkg_freq1
    freq2 = (1 - alpha) * stats1.freq2 + alpha * bkg_freq2

    # One entry per unit of `width` (presumably one per alignment column),
    # holding the number of non-gap letters of that alphabet.
    n_letts = np.hstack([
        width * [alphabet.size(no_gap=True)]
        for alphabet, width in align.alphabets
    ])
    # Start offset of each block: cumulative sum shifted right by one.
    idxs0 = np.hstack(([0], np.cumsum(n_letts)[:-1]))
    for idx0, n_lett in zip(idxs0, n_letts):
        idxs = slice(idx0, idx0 + n_lett)
        # The diagonal block is replaced by the diagonal matrix of the
        # corresponding single-site frequencies.
        # noinspection PyUnresolvedReferences
        freq2[idxs, idxs] = np.diag(freq1[idxs])

    cmat = freq2 - np.outer(freq1, freq1)

    self.assertTrue(np.allclose(freq1, stats2.freq1))
    # noinspection PyTypeChecker
    self.assertTrue(np.allclose(freq2, stats2.freq2))
    # noinspection PyTypeChecker
    self.assertTrue(np.allclose(cmat, stats2.cmat))
def test_protein_unchanged_invalid(self):
    """Check `load_fasta` on the protein fixture with ``invalid_letter_policy='unchanged'``.

    The loaded alignment must equal one built from the literal sequences
    below (second sequence starting with 'X') with names seq1..seq3.
    """
    from multicov.alignment import Alignment
    from multicov.align_io import load_fasta
    from multicov.alphabet import protein_alphabet

    fasta_path = os.path.join('test_data', 'test_aln1.fasta')
    loaded = load_fasta(fasta_path, protein_alphabet,
                        invalid_letter_policy='unchanged')

    sequences = ['IVGGYTCQ', 'XVGGTEAQ', 'IGG-KDT-']
    reference = Alignment(sequences, alphabet=protein_alphabet)
    reference.annotations['name'] = ['seq1', 'seq2', 'seq3']

    self.assertEqual(loaded, reference)
def test_protein_keep_annot_ws(self):
    """Check `load_fasta` on the protein fixture with ``strip_ws_in_annot=False``.

    The expected 'name' annotations keep their surrounding whitespace
    ('seq1 ' and ' seq2'); the expected sequences are the literals below.
    """
    from multicov.alignment import Alignment
    from multicov.align_io import load_fasta
    from multicov.alphabet import protein_alphabet

    fasta_path = os.path.join('test_data', 'test_aln1.fasta')
    loaded = load_fasta(fasta_path, protein_alphabet,
                        strip_ws_in_annot=False)

    sequences = ['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-']
    reference = Alignment(sequences, alphabet=protein_alphabet)
    reference.annotations['name'] = ['seq1 ', ' seq2', 'seq3']

    self.assertEqual(loaded, reference)
def test_cmat_on_multi_alpha(self):
    """Compare `Statistics.cmat` with a reference on a DNA + protein + RNA alignment.

    The reference is ``f2 - outer(f1, f1)``, with ``f1`` and ``f2`` taken
    from `_slow_get_freq1` and `_slow_get_freq2`.
    """
    from os.path import join

    from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
    from multicov.align_io import load_fasta
    from multicov.statistics import Statistics

    combined = load_fasta(join('test_data', 'test_aln2.fasta'),
                          dna_alphabet, invalid_letter_policy='gap')
    protein_part = load_fasta(join('test_data', 'test_aln1.fasta'),
                              protein_alphabet, invalid_letter_policy='gap')
    rna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                          rna_alphabet, invalid_letter_policy='uppergap')
    combined.add(protein_part).add(rna_part)

    computed = Statistics(combined)

    reference_f1 = _slow_get_freq1(combined)
    reference_f2 = _slow_get_freq2(combined)
    reference_cmat = reference_f2 - np.outer(reference_f1, reference_f1)

    self.assertTrue(np.allclose(computed.cmat, reference_cmat))
def test_gap_gauge(self):
    """Check that an all-gap sequence scores zero under the model.

    A `MaxentModel` is built on a DNA + protein + RNA alignment
    (regularization amount 0.5); scoring a single sequence made only of
    '-' characters, as long as the alignment is wide, must give values
    below 1e-10 in absolute value.
    """
    from os.path import join

    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
    from multicov.align_io import load_fasta
    from multicov.statistics import Statistics, MaxentModel

    dna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                          dna_alphabet, invalid_letter_policy='gap')
    protein_part = load_fasta(join('test_data', 'test_aln1.fasta'),
                              protein_alphabet, invalid_letter_policy='gap')
    rna_part = load_fasta(join('test_data', 'test_aln2.fasta'),
                          rna_alphabet, invalid_letter_policy='uppergap')

    combined = Alignment(dna_part)
    combined.add(protein_part).add(rna_part)

    model = MaxentModel(Statistics(combined, regularization_amount=0.5))

    all_gaps = combined.get_width() * '-'
    scores = model.score([all_gaps])

    self.assertLess(np.max(np.abs(scores)), 1e-10)
def test_mask_from_first_seq(self):
    """Check `load_fasta` with a callable `mask_fct` on the protein fixture.

    The mask callable returns True for every letter that is neither 'V'
    nor 'G'. The expected alignment holds the literal sequences below
    with names seq1..seq3; presumably the mask is evaluated on the first
    sequence, as the test name suggests.
    """
    from multicov.alignment import Alignment
    from multicov.align_io import load_fasta
    from multicov.alphabet import protein_alphabet

    # numpy.in1d is deprecated since NumPy 2.0 in favour of numpy.isin.
    # Fall back to in1d only on NumPy versions that predate isin; the two
    # agree for the one-dimensional input used here.
    try:
        from numpy import isin
    except ImportError:
        from numpy import in1d as isin

    align = load_fasta(os.path.join('test_data', 'test_aln1.fasta'),
                       protein_alphabet,
                       invalid_letter_policy='unchanged',
                       mask_fct=lambda s: ~isin(list(s), ['V', 'G']))

    expected = Alignment(['IYTCQ', 'XTEAQ', 'IKDT-'],
                         alphabet=protein_alphabet)
    expected.annotations['name'] = ['seq1', 'seq2', 'seq3']

    self.assertEqual(align, expected)