Example #1
0
def plot_alignment_statistics(alignment_file, sample_aln_file, aln_format, max_gap_pos, plot_file):
    """Plot observed versus sampled amino acid frequencies of two alignments.

    Both alignments are read with ``io.read_msa``. Single and pairwise
    frequencies are computed for each of them with ``PseudoCounts``, the gap
    state is stripped with ``degap`` and the four frequency arrays are handed
    to ``plot.plot_empirical_vs_model_statistics``.

    Args:
        alignment_file: Path of the observed alignment.
        sample_aln_file: Path of the sampled alignment.
        aln_format: Alignment format, forwarded to ``io.read_msa``.
        max_gap_pos: Gap threshold forwarded to
            ``gaps.remove_gapped_positions``; column filtering is skipped
            entirely when the value is >= 100.
        plot_file: Output location, forwarded to the plotting routine.

    Raises:
        SystemExit: With status 1 when either alignment cannot be read
            (``OSError`` from ``io.read_msa``).
    """

    # read both alignments; a failed read is an error and must be reported
    # with a non-zero exit status so that calling scripts can detect it
    try:
        alignment = io.read_msa(alignment_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(alignment_file, e))
        sys.exit(1)

    try:
        sampled_alignment = io.read_msa(sample_aln_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(sample_aln_file, e))
        sys.exit(1)

    # Remove positions with > MAX_GAP_POS % gaps
    if max_gap_pos < 100:
        alignment, gapped_positions = gaps.remove_gapped_positions(alignment, max_gap_pos)
        # The columns dropped from the observed alignment are dropped from the
        # sampled alignment as well. A set gives constant-time membership
        # tests, so the filter is linear in the number of columns.
        # assumes gapped_positions is a flat collection of column indices — TODO confirm
        gapped = set(gapped_positions)
        non_gapped_positions = [i for i in range(sampled_alignment.shape[1]) if i not in gapped]
        sampled_alignment = np.ascontiguousarray(sampled_alignment[:, non_gapped_positions])

    # compute sequence weights for observed sequences
    weights = ccmpred.weighting.weights_simple(alignment, 0.8)

    # compute observed amino acid frequencies
    pseudocounts = PseudoCounts(alignment, weights)
    pseudocounts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )
    single_freq_observed, pairwise_freq_observed = pseudocounts.freqs

    # compute sequence weights for sampled sequences (usually all sampled sequences obtain weight = 1 )
    weights_sampled = ccmpred.weighting.weights_simple(sampled_alignment, 0.8)

    # compute sampled amino acid frequencies
    pseudocounts = PseudoCounts(sampled_alignment, weights_sampled)
    pseudocounts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )
    single_freq_sampled, pairwise_freq_sampled = pseudocounts.freqs

    # degap the frequencies (ignore gap frequencies)
    # NOTE(review): degap is invoked on the PseudoCounts object of the sampled
    # alignment for the observed frequencies too; this presumes degap does not
    # depend on instance state — confirm.
    single_freq_observed = pseudocounts.degap(single_freq_observed, False)
    single_freq_sampled = pseudocounts.degap(single_freq_sampled, False)
    pairwise_freq_observed = pseudocounts.degap(pairwise_freq_observed, False)
    pairwise_freq_sampled = pseudocounts.degap(pairwise_freq_sampled, False)

    # plot
    plot.plot_empirical_vs_model_statistics(
        single_freq_observed, single_freq_sampled,
        pairwise_freq_observed, pairwise_freq_sampled,
        plot_file)
Example #2
0
def plot_aminoacid_distribution(alignment_file, aln_format, plot_file):
    """Plot the amino acid distribution of an alignment.

    The alignment is read with ``io.read_msa``, sequence weights and
    frequencies are computed, and the single-position counts
    (``pseudocounts.counts[0]``) are passed to ``plot.plot_alignment``
    together with a title describing the alignment.

    Args:
        alignment_file: Path of the alignment; the part of its base name
            before the first "." is used as the protein name in the title.
        aln_format: Alignment format, forwarded to ``io.read_msa``.
        plot_file: Output location, forwarded to ``plot.plot_alignment``.

    Raises:
        SystemExit: With status 1 when the alignment cannot be read
            (``OSError`` from ``io.read_msa``).
    """

    protein = os.path.basename(alignment_file).split(".")[0]

    # read alignment; a failed read is an error, so exit with a non-zero
    # status instead of signalling success to the caller
    try:
        alignment = io.read_msa(alignment_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(alignment_file, e))
        sys.exit(1)

    # alignment dimensions: N rows (sequences) and L columns
    N, L = alignment.shape[0], alignment.shape[1]
    diversity = np.sqrt(N) / L

    # compute sequence weights
    weights = ccmpred.weighting.weights_simple(alignment, 0.8, False)

    # compute frequencies (this also fills pseudocounts.counts used below)
    pseudocounts = PseudoCounts(alignment, weights)
    pseudocounts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )

    # plot
    title = "Amino Acid Distribution in Alignment for {0} (N={1}, L={2}, diversity={3})".format(
        protein, N, L, np.round(diversity, decimals=3))
    plot.plot_alignment(pseudocounts.counts[0], title, plot_file)
Example #3
0
def plot_contact_map(alignment_file, aln_format, braw_file, mat_file, pdb_file, plot_file,
                     entropy_correction, apc, seqsep, contact_threshold):
    """Plot a contact map from a binary raw coupling file or a matrix file.

    The score matrix comes from ``braw_file`` (Frobenius score of the pair
    couplings, optionally entropy- or APC-corrected) or from ``mat_file``
    (optionally APC-corrected). When both are given, the matrix read from
    ``mat_file`` is the one that is plotted, because it is assigned last.

    Args:
        alignment_file: Optional alignment path. When given, the percentage
            of gaps per position is plotted alongside the contact map; it is
            mandatory for entropy correction.
        aln_format: Alignment format, forwarded to ``io.read_msa``.
        braw_file: Optional binary raw coupling file (msgpack).
        mat_file: Optional matrix file.
        pdb_file: Optional PDB file; when given, observed distances and a
            binary ``contact`` column (distance < ``contact_threshold``) are
            added to the plotted data.
        plot_file: Output location, forwarded to the plotting routine.
        entropy_correction: Apply the entropy correction to the braw scores.
        apc: Apply the average product correction (ignored for braw scores
            when ``entropy_correction`` is set).
        seqsep: Diagonal offset for ``np.triu_indices``; only residue pairs
            with ``j - i >= seqsep`` are plotted.
        contact_threshold: Distance cutoff below which a pair is a contact.

    Raises:
        SystemExit: With status 1 when entropy correction is requested
            without alignment and braw file, or when neither ``braw_file``
            nor ``mat_file`` is given.
    """

    pseudocounts = None
    mat = None
    gaps_percentage_plot = None
    protein = None

    if entropy_correction and (alignment_file is None or braw_file is None):
        print("Entropy correction requires specification of alignment file and binary raw couplign file!")
        sys.exit(1)

    # Without a score source there is nothing to plot; fail with a clear
    # message instead of a TypeError from len(None) further down.
    if braw_file is None and mat_file is None:
        print("Contact map requires specification of a binary raw coupling file or a matrix file!")
        sys.exit(1)

    if alignment_file is not None:
        protein = os.path.basename(alignment_file).split(".")[0]
        alignment = io.read_msa(alignment_file, aln_format)

        # compute sequence weights
        weights = ccmpred.weighting.weights_simple(alignment, 0.8)

        # compute frequencies
        pseudocounts = PseudoCounts(alignment, weights)
        pseudocounts.calculate_frequencies(
            'uniform_pseudocounts', 1, 1, remove_gaps=False
        )

        gaps_percentage_plot = plot.plot_percentage_gaps_per_position(pseudocounts.counts[0], plot_file=None)

    if braw_file is not None:

        # the protein name is taken from the most recently handled input file
        protein = os.path.basename(braw_file).split(".")[0]

        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        # compute frobenius score from couplings
        mat = io_cm.frobenius_score(braw.x_pair)

        if entropy_correction:
            # pseudocounts is guaranteed to be set here: the check at the top
            # enforces an alignment file whenever entropy correction is on
            scaling_factor_eta, mat = io_cm.compute_local_correction(
                pseudocounts.freqs[0],
                braw.x_pair,
                meta_info['workflow'][0]['msafile']['neff'],
                meta_info['workflow'][0]['regularization']['lambda_pair'],
                mat,
                entropy=True
            )
        elif apc:
            mat = io_cm.apc(mat)

    if mat_file is not None:

        protein = os.path.basename(mat_file).split(".")[0]

        # the meta information stored with the matrix is not needed here
        mat, _ = io_cm.read_matrix(mat_file)

        if apc:
            mat = io_cm.apc(mat)

    L = len(mat)
    indices_upper_tri_i, indices_upper_tri_j = np.triu_indices(L, seqsep)

    # one row per residue pair of the upper triangle, residues numbered from 1
    plot_matrix = pd.DataFrame()
    plot_matrix['residue_i'] = indices_upper_tri_i + 1
    plot_matrix['residue_j'] = indices_upper_tri_j + 1
    plot_matrix['confidence'] = mat[indices_upper_tri_i, indices_upper_tri_j]

    if pdb_file is not None:
        # compute distance map from pdb file
        observed_distances = io.distance_map(pdb_file, L)
        plot_matrix['distance'] = observed_distances[indices_upper_tri_i, indices_upper_tri_j]
        plot_matrix['contact'] = ((plot_matrix.distance < contact_threshold) * 1).tolist()

    plot_title = "Contact Map for protein {0}".format(protein)

    # Plot Contact Map
    plot.plot_contact_map_someScore_plotly(plot_matrix, plot_title, seqsep, gaps_percentage_plot, plot_file)
Example #4
0
def plot_alignment_statistics(alignment_file, sample_aln_file, aln_format,
                              plot_file):
    """Plot observed versus sampled alignment statistics with a summary title.

    Both alignments are read with ``io.read_msa``. For each of them the
    sequence weights, two Neff estimates and the single/pairwise amino acid
    frequencies are computed. The gap state is stripped from the frequencies
    with ``degap`` and they are passed, together with a descriptive title, to
    ``plot.plot_empirical_vs_model_statistics``.

    Args:
        alignment_file: Path of the observed alignment; the part of its base
            name before the first "." is used as the protein name.
        sample_aln_file: Path of the sampled alignment.
        aln_format: Alignment format, forwarded to ``io.read_msa``.
        plot_file: Output location, forwarded to the plotting routine.

    Raises:
        SystemExit: With status 1 when either alignment cannot be read
            (``OSError`` from ``io.read_msa``).
    """

    protein = os.path.basename(alignment_file).split(".")[0]

    # read alignment; a failed read is an error and is reported with a
    # non-zero exit status
    try:
        alignment = io.read_msa(alignment_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(
            alignment_file, e))
        sys.exit(1)

    try:
        sampled_alignment = io.read_msa(sample_aln_file, aln_format)
    except OSError as e:
        # Report the file name: sampled_alignment is still unbound when the
        # read fails, so formatting it here would raise UnboundLocalError.
        print("Problems reading alignment file {0}: {1}!".format(
            sample_aln_file, e))
        sys.exit(1)

    # get alignment statistics
    N_o = alignment.shape[0]
    N_s = sampled_alignment.shape[0]
    L = alignment.shape[1]
    div = np.round(np.sqrt(N_o) / L, decimals=3)

    ### alignment

    # compute sequence weights
    weights = ccmpred.weighting.weights_simple(alignment, 0.8, False)
    neff_weights_o = np.round(np.sum(weights), decimals=3)
    neff_entropy_o = np.round(ccmpred.weighting.get_HHsuite_neff(alignment),
                              decimals=3)

    # compute frequencies
    pseudocounts = PseudoCounts(alignment, weights)
    pseudocounts.calculate_frequencies('uniform_pseudocounts',
                                       1,
                                       1,
                                       remove_gaps=False)

    # get original amino acid frequencies
    single_freq_observed, pairwise_freq_observed = pseudocounts.freqs

    ### sampled alignment

    # compute sequence weights
    weights_sampled = ccmpred.weighting.weights_simple(sampled_alignment, 0.8,
                                                       False)
    neff_weights_s = np.round(np.sum(weights_sampled), decimals=3)
    neff_entropy_s = np.round(
        ccmpred.weighting.get_HHsuite_neff(sampled_alignment), decimals=3)

    # compute frequencies
    pseudocounts = PseudoCounts(sampled_alignment, weights_sampled)
    pseudocounts.calculate_frequencies('uniform_pseudocounts',
                                       1,
                                       1,
                                       remove_gaps=False)

    # get amino acid frequencies
    single_freq_sampled, pairwise_freq_sampled = pseudocounts.freqs

    # degap the frequencies (ignore gap frequencies)
    # NOTE(review): degap is invoked on the PseudoCounts object of the sampled
    # alignment for the observed frequencies too; this presumes degap does not
    # depend on instance state — confirm.
    single_freq_observed = pseudocounts.degap(single_freq_observed, False)
    single_freq_sampled = pseudocounts.degap(single_freq_sampled, False)
    pairwise_freq_observed = pseudocounts.degap(pairwise_freq_observed, False)
    pairwise_freq_sampled = pseudocounts.degap(pairwise_freq_sampled, False)

    # Define plot title
    # NOTE(review): the "sampled" line reports L of the observed alignment;
    # this presumes both alignments have the same number of columns — confirm.
    title = "Observed and model alignment statistics for {0}".format(protein)
    title += "<br>original: N={0}, L={1}, div={2}, neff(weights)={3}, neff(entropy)={4}".format(
        N_o, L, div, neff_weights_o, neff_entropy_o)
    title += "<br>sampled: N={0}, L={1}, neff(weights)={2}, neff(entropy)={3}".format(
        N_s, L, neff_weights_s, neff_entropy_s)

    # plot
    plot.plot_empirical_vs_model_statistics(single_freq_observed,
                                            single_freq_sampled,
                                            pairwise_freq_observed,
                                            pairwise_freq_sampled,
                                            title,
                                            plot_file,
                                            log=False)