Example #1
0
    def read_alignment(self,
                       aln_format="psicov",
                       max_gap_pos=100,
                       max_gap_seq=100):
        """Read the alignment in ``self.alignment_file`` and record summary statistics.

        The alignment is loaded with ``io.read_msa`` and stored in ``self.msa``.
        Optional gap filtering is applied first to sequences and then to
        positions; afterwards ``self.N``, ``self.L``, ``self.diversity`` and
        ``self.neff_entropy`` are set and a two-line summary is printed.

        Args:
            aln_format: Format identifier forwarded to ``io.read_msa``.
            max_gap_pos: Threshold forwarded to ``gaps.remove_gapped_positions``.
                Position filtering only runs when this is below 100; in that
                case ``self.gapped_positions`` and ``self.max_gap_pos`` are
                also set.
            max_gap_seq: Threshold forwarded to ``gaps.remove_gapped_sequences``.
                Sequence filtering only runs when this is below 100; in that
                case ``self.max_gap_seq`` is also set.

        NOTE(review): the thresholds look like gap percentages (100 disables
        filtering) -- confirm against the ``gaps`` module.
        """
        self.msa = io.read_msa(self.alignment_file, aln_format)

        # Sequences are filtered before positions.
        if max_gap_seq < 100:
            kept_sequences = gaps.remove_gapped_sequences(self.msa, max_gap_seq)
            self.msa = kept_sequences
            self.max_gap_seq = max_gap_seq

        if max_gap_pos < 100:
            filtered = gaps.remove_gapped_positions(self.msa, max_gap_pos)
            self.msa, self.gapped_positions = filtered
            self.max_gap_pos = max_gap_pos

        # Rows are sequences (N), columns are alignment positions (L).
        dims = self.msa.shape
        self.N = dims[0]
        self.L = dims[1]
        self.diversity = np.sqrt(self.N) / self.L
        self.neff_entropy = ccmpred.weighting.get_HHsuite_neff(self.msa)

        size_template = (
            "{0} is of length L={1} and there are {2} sequences in the alignment."
        )
        print(size_template.format(self.protein, self.L, self.N))

        diversity_template = (
            "Alignment has diversity [sqrt(N)/L]={0} and Neff(HHsuite-like)={1}."
        )
        rounded_diversity = np.round(self.diversity, decimals=3)
        rounded_neff = np.round(self.neff_entropy, decimals=3)
        print(diversity_template.format(rounded_diversity, rounded_neff))
Example #2
0
def plot_alignment_statistics(alignment_file, sample_aln_file, aln_format, max_gap_pos, plot_file):
    """Plot observed versus sampled amino acid frequency statistics.

    Reads an observed alignment and a sampled alignment, optionally drops
    heavily gapped columns from both, computes weighted single and pairwise
    frequencies for each, removes the gap state from those frequencies and
    hands the four arrays to ``plot.plot_empirical_vs_model_statistics``.

    Args:
        alignment_file: Path of the observed alignment.
        sample_aln_file: Path of the sampled alignment.
        aln_format: Format identifier forwarded to ``io.read_msa`` for both
            files.
        max_gap_pos: Columns with more than this percentage of gaps in the
            observed alignment are removed from both alignments. Filtering
            only runs when the value is below 100.
        plot_file: Output target forwarded to the plotting routine.

    Raises:
        SystemExit: With status 1 when either alignment cannot be read
            (``OSError``); the problem is printed before exiting.
    """
    # Read both alignments. An unreadable file is a failure, so exit with a
    # non-zero status to let calling shells/pipelines detect it.
    try:
        alignment = io.read_msa(alignment_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(alignment_file, e))
        sys.exit(1)

    try:
        sampled_alignment = io.read_msa(sample_aln_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(sample_aln_file, e))
        sys.exit(1)

    # Remove positions with > MAX_GAP_POS % gaps
    if max_gap_pos < 100:
        alignment, gapped_positions = gaps.remove_gapped_positions(alignment, max_gap_pos)
        # Build the lookup once so the column scan below is linear.
        # assumes gapped_positions is a flat iterable of column indices -- TODO confirm
        gapped_lookup = set(gapped_positions)
        # The columns dropped from the observed alignment are dropped from the
        # sampled one too, keeping both on the same set of positions.
        non_gapped_positions = [
            i for i in range(sampled_alignment.shape[1]) if i not in gapped_lookup
        ]
        sampled_alignment = np.ascontiguousarray(sampled_alignment[:, non_gapped_positions])

    # compute sequence weights for observed sequences
    # NOTE(review): 0.8 is presumably a sequence identity cutoff -- confirm
    weights = ccmpred.weighting.weights_simple(alignment, 0.8)

    # compute observed amino acid frequencies
    observed_counts = PseudoCounts(alignment, weights)
    observed_counts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )
    single_freq_observed, pairwise_freq_observed = observed_counts.freqs

    # compute sequence weights for sampled sequences (usually all sampled sequences obtain weight = 1 )
    weights_sampled = ccmpred.weighting.weights_simple(sampled_alignment, 0.8)

    # compute sampled amino acid frequencies
    sampled_counts = PseudoCounts(sampled_alignment, weights_sampled)
    sampled_counts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )
    single_freq_sampled, pairwise_freq_sampled = sampled_counts.freqs

    # degap the frequencies (ignore gap frequencies)
    # All four arrays go through the sampled PseudoCounts instance, exactly as
    # before; degap presumably does not depend on instance state -- confirm.
    single_freq_observed = sampled_counts.degap(single_freq_observed, False)
    single_freq_sampled = sampled_counts.degap(single_freq_sampled, False)
    pairwise_freq_observed = sampled_counts.degap(pairwise_freq_observed, False)
    pairwise_freq_sampled = sampled_counts.degap(pairwise_freq_sampled, False)

    # plot
    plot.plot_empirical_vs_model_statistics(
        single_freq_observed, single_freq_sampled,
        pairwise_freq_observed, pairwise_freq_sampled,
        plot_file)