def plot_alignment_statistics(alignment_file, sample_aln_file, aln_format, max_gap_pos, plot_file):
    """Plot observed versus sampled amino acid frequencies of two alignments.

    Both alignments are read with ``io.read_msa``. When ``max_gap_pos`` is
    below 100, gapped positions are removed from the observed alignment via
    ``gaps.remove_gapped_positions`` and the same columns are dropped from the
    sampled alignment. Single and pairwise frequencies are then computed for
    both alignments, degapped, and handed to
    ``plot.plot_empirical_vs_model_statistics``.

    NOTE(review): a second function with the same name but a different
    signature is defined further down in this file; at import time that later
    definition replaces this one.

    Args:
        alignment_file: Path of the observed alignment.
        sample_aln_file: Path of the sampled alignment.
        aln_format: Alignment format identifier passed to ``io.read_msa``.
        max_gap_pos: Gap percentage threshold; values >= 100 disable the
            removal of gapped positions.
        plot_file: Output location forwarded to the plotting routine.

    Raises:
        SystemExit: With status 1 when either alignment cannot be read.
    """
    # read alignment
    try:
        alignment = io.read_msa(alignment_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(alignment_file, e))
        # a failed read is an error: report a non-zero exit status
        sys.exit(1)

    try:
        sampled_alignment = io.read_msa(sample_aln_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(sample_aln_file, e))
        sys.exit(1)

    # Remove positions with > MAX_GAP_POS % gaps
    if max_gap_pos < 100:
        alignment, gapped_positions = gaps.remove_gapped_positions(alignment, max_gap_pos)
        # build the lookup once so each membership test is O(1)
        # (assumes gapped_positions is a flat sequence of column indices)
        gapped_lookup = set(gapped_positions)
        non_gapped_positions = [
            i for i in range(sampled_alignment.shape[1]) if i not in gapped_lookup
        ]
        sampled_alignment = np.ascontiguousarray(sampled_alignment[:, non_gapped_positions])

    # compute sequence weights for observed sequences
    weights = ccmpred.weighting.weights_simple(alignment, 0.8)

    # compute observed amino acid frequencies
    pseudocounts = PseudoCounts(alignment, weights)
    pseudocounts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )
    single_freq_observed, pairwise_freq_observed = pseudocounts.freqs

    # compute sequence weights for sampled sequences
    # (usually all sampled sequences obtain weight = 1)
    weights_sampled = ccmpred.weighting.weights_simple(sampled_alignment, 0.8)

    # compute sampled amino acid frequencies
    pseudocounts = PseudoCounts(sampled_alignment, weights_sampled)
    pseudocounts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )
    single_freq_sampled, pairwise_freq_sampled = pseudocounts.freqs

    # degap the frequencies (ignore gap frequencies); degap is invoked on the
    # PseudoCounts object of the sampled alignment for all four arrays
    single_freq_observed = pseudocounts.degap(single_freq_observed, False)
    single_freq_sampled = pseudocounts.degap(single_freq_sampled, False)
    pairwise_freq_observed = pseudocounts.degap(pairwise_freq_observed, False)
    pairwise_freq_sampled = pseudocounts.degap(pairwise_freq_sampled, False)

    # plot
    plot.plot_empirical_vs_model_statistics(
        single_freq_observed,
        single_freq_sampled,
        pairwise_freq_observed,
        pairwise_freq_sampled,
        plot_file)
def plot_aminoacid_distribution(alignment_file, aln_format, plot_file):
    """Plot the amino acid distribution of an alignment.

    The alignment is read with ``io.read_msa``, sequence weights and
    frequencies are computed, and the single-position counts are passed to
    ``plot.plot_alignment`` together with a title that lists the protein
    name, N, L and the diversity ``sqrt(N) / L``.

    Args:
        alignment_file: Path of the alignment; the part of its base name
            before the first "." is used as the protein name in the title.
        aln_format: Alignment format identifier passed to ``io.read_msa``.
        plot_file: Output location forwarded to ``plot.plot_alignment``.

    Raises:
        SystemExit: With status 1 when the alignment cannot be read.
    """
    protein = os.path.basename(alignment_file).split(".")[0]

    # read alignment
    try:
        alignment = io.read_msa(alignment_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(alignment_file, e))
        # a failed read is an error: report a non-zero exit status
        sys.exit(1)

    # rows are counted as N and columns as L (shape[0] and shape[1])
    N, L = alignment.shape[0], alignment.shape[1]
    diversity = np.sqrt(N) / L

    # compute sequence weights
    weights = ccmpred.weighting.weights_simple(alignment, 0.8, False)

    # compute frequencies
    pseudocounts = PseudoCounts(alignment, weights)
    pseudocounts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )

    # plot
    plot.plot_alignment(
        pseudocounts.counts[0],
        "Amino Acid Distribution in Alignment for {0} (N={1}, L={2}, diversity={3})".format(
            protein, N, L, np.round(diversity, decimals=3)),
        plot_file
    )
def plot_contact_map(alignment_file, aln_format, braw_file, mat_file, pdb_file, plot_file,
                     entropy_correction, apc, seqsep, contact_threshold):
    """Plot a contact map from a binary raw coupling file or a matrix file.

    The score matrix comes from ``braw_file`` (Frobenius score of the pair
    couplings, optionally entropy- or APC-corrected) and/or from ``mat_file``
    (optionally APC-corrected); when both are given, the matrix read from
    ``mat_file`` is the one that is plotted. If ``alignment_file`` is given, a
    gaps-per-position subplot is prepared as well. If ``pdb_file`` is given,
    observed distances and a binary contact column are added to the plotted
    table.

    Args:
        alignment_file: Optional path of an alignment.
        aln_format: Alignment format identifier passed to ``io.read_msa``.
        braw_file: Optional path of a binary raw (msgpack) coupling file.
        mat_file: Optional path of a score matrix file.
        pdb_file: Optional path of a PDB file used for observed distances.
        plot_file: Output location forwarded to the plotting routine.
        entropy_correction: Apply the entropy correction to the braw scores;
            requires both ``alignment_file`` and ``braw_file``.
        apc: Apply ``io_cm.apc`` to the score matrix (for braw scores only
            when ``entropy_correction`` is false).
        seqsep: Diagonal offset passed to ``np.triu_indices``; only residue
            pairs at least this far apart are included.
        contact_threshold: Distances below this value are marked as contacts.

    Raises:
        SystemExit: With status 1 when entropy correction is requested
            without the files it needs, or when neither ``braw_file`` nor
            ``mat_file`` is given.
    """
    pseudocounts = None
    mat = None
    gaps_percentage_plot = None
    protein = None

    if entropy_correction and (alignment_file is None or braw_file is None):
        print("Entropy correction requires specification of alignment file and binary raw coupling file!")
        sys.exit(1)

    # Without a score source there is nothing to plot; fail with a clear
    # message instead of a TypeError from len(None) further down.
    if braw_file is None and mat_file is None:
        print("Plotting a contact map requires a binary raw coupling file or a matrix file!")
        sys.exit(1)

    if alignment_file is not None:
        protein = os.path.basename(alignment_file).split(".")[0]
        alignment = io.read_msa(alignment_file, aln_format)

        # compute sequence weights
        weights = ccmpred.weighting.weights_simple(alignment, 0.8)

        # compute frequencies
        pseudocounts = PseudoCounts(alignment, weights)
        pseudocounts.calculate_frequencies(
            'uniform_pseudocounts', 1, 1, remove_gaps=False
        )

        gaps_percentage_plot = plot.plot_percentage_gaps_per_position(
            pseudocounts.counts[0], plot_file=None)

    if braw_file is not None:
        protein = os.path.basename(braw_file).split(".")[0]

        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        # compute frobenius score from couplings
        mat = io_cm.frobenius_score(braw.x_pair)

        if entropy_correction:
            # pseudocounts is guaranteed to be set here: the check at the top
            # ensures an alignment file was supplied
            scaling_factor_eta, mat = io_cm.compute_local_correction(
                pseudocounts.freqs[0],
                braw.x_pair,
                meta_info['workflow'][0]['msafile']['neff'],
                meta_info['workflow'][0]['regularization']['lambda_pair'],
                mat,
                entropy=True
            )
        elif apc:
            mat = io_cm.apc(mat)

    if mat_file is not None:
        protein = os.path.basename(mat_file).split(".")[0]

        mat, meta_info = io_cm.read_matrix(mat_file)

        if apc:
            mat = io_cm.apc(mat)

    L = len(mat)
    indices_upper_tri_i, indices_upper_tri_j = np.triu_indices(L, seqsep)

    plot_matrix = pd.DataFrame()
    # residue numbers are reported 1-based
    plot_matrix['residue_i'] = indices_upper_tri_i + 1
    plot_matrix['residue_j'] = indices_upper_tri_j + 1
    plot_matrix['confidence'] = mat[indices_upper_tri_i, indices_upper_tri_j]

    if pdb_file is not None:
        # compute distance map from pdb file
        observed_distances = io.distance_map(pdb_file, L)
        plot_matrix['distance'] = observed_distances[indices_upper_tri_i, indices_upper_tri_j]
        plot_matrix['contact'] = ((plot_matrix.distance < contact_threshold) * 1).tolist()

    plot_title = "Contact Map for protein {0}".format(protein)

    # Plot Contact Map
    plot.plot_contact_map_someScore_plotly(
        plot_matrix, plot_title, seqsep, gaps_percentage_plot, plot_file)
def plot_alignment_statistics(alignment_file, sample_aln_file, aln_format, plot_file):
    """Plot observed versus sampled alignment statistics with a summary title.

    Both alignments are read with ``io.read_msa``. For each one, sequence
    weights, two Neff values (sum of weights and
    ``ccmpred.weighting.get_HHsuite_neff``) and single/pairwise frequencies
    are computed. The degapped frequencies are passed to
    ``plot.plot_empirical_vs_model_statistics`` together with a title that
    summarises both alignments.

    Args:
        alignment_file: Path of the observed alignment; the part of its base
            name before the first "." is used as the protein name.
        sample_aln_file: Path of the sampled alignment.
        aln_format: Alignment format identifier passed to ``io.read_msa``.
        plot_file: Output location forwarded to the plotting routine.

    Raises:
        SystemExit: With status 1 when either alignment cannot be read.
    """
    protein = os.path.basename(alignment_file).split(".")[0]

    # read alignment
    try:
        alignment = io.read_msa(alignment_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(
            alignment_file, e))
        # a failed read is an error: report a non-zero exit status
        sys.exit(1)

    try:
        sampled_alignment = io.read_msa(sample_aln_file, aln_format)
    except OSError as e:
        # report the file name; the alignment variable is still unbound when
        # the read fails, so it must not be referenced here
        print("Problems reading alignment file {0}: {1}!".format(
            sample_aln_file, e))
        sys.exit(1)

    # get alignment statistics
    N_o = alignment.shape[0]
    N_s = sampled_alignment.shape[0]
    L = alignment.shape[1]
    div = np.round(np.sqrt(N_o) / L, decimals=3)

    ### alignment

    # compute sequence weights
    weights = ccmpred.weighting.weights_simple(alignment, 0.8, False)
    neff_weights_o = np.round(np.sum(weights), decimals=3)
    neff_entropy_o = np.round(ccmpred.weighting.get_HHsuite_neff(alignment), decimals=3)

    # compute frequencies
    pseudocounts = PseudoCounts(alignment, weights)
    pseudocounts.calculate_frequencies('uniform_pseudocounts', 1, 1, remove_gaps=False)

    # get original amino acid frequencies
    single_freq_observed, pairwise_freq_observed = pseudocounts.freqs

    ### sampled alignment

    # compute sequence weights
    weights_sampled = ccmpred.weighting.weights_simple(sampled_alignment, 0.8, False)
    neff_weights_s = np.round(np.sum(weights_sampled), decimals=3)
    neff_entropy_s = np.round(
        ccmpred.weighting.get_HHsuite_neff(sampled_alignment), decimals=3)

    # compute frequencies
    pseudocounts = PseudoCounts(sampled_alignment, weights_sampled)
    pseudocounts.calculate_frequencies('uniform_pseudocounts', 1, 1, remove_gaps=False)

    # get amino acid frequencies
    single_freq_sampled, pairwise_freq_sampled = pseudocounts.freqs

    # degap the frequencies (ignore gap frequencies); degap is invoked on the
    # PseudoCounts object of the sampled alignment for all four arrays
    single_freq_observed = pseudocounts.degap(single_freq_observed, False)
    single_freq_sampled = pseudocounts.degap(single_freq_sampled, False)
    pairwise_freq_observed = pseudocounts.degap(pairwise_freq_observed, False)
    pairwise_freq_sampled = pseudocounts.degap(pairwise_freq_sampled, False)

    # Define plot title; L of the observed alignment is reported on both lines
    title = "Observed and model alignment statistics for {0}".format(protein)
    title += "<br>original: N={0}, L={1}, div={2}, neff(weights)={3}, neff(entropy)={4}".format(
        N_o, L, div, neff_weights_o, neff_entropy_o)
    title += "<br>sampled: N={0}, L={1}, neff(weights)={2}, neff(entropy)={3}".format(
        N_s, L, neff_weights_s, neff_entropy_s)

    # plot
    plot.plot_empirical_vs_model_statistics(
        single_freq_observed,
        single_freq_sampled,
        pairwise_freq_observed,
        pairwise_freq_sampled,
        title,
        plot_file,
        log=False)