def read_alignment(self, aln_format="psicov", max_gap_pos=100, max_gap_seq=100):
    """Load the alignment, optionally filter gappy rows/columns, and report its size.

    The alignment is read from ``self.alignment_file`` with ``io.read_msa`` and
    stored in ``self.msa``. Filtering is only performed for thresholds below
    100; both thresholds are recorded on the instance either way.

    Args:
        aln_format: Format name forwarded to ``io.read_msa``.
        max_gap_pos: Threshold forwarded to ``gaps.remove_gapped_positions``
            (presumably a per-column gap percentage -- confirm against ``gaps``).
        max_gap_seq: Threshold forwarded to ``gaps.remove_gapped_sequences``
            (presumably a per-sequence gap percentage -- confirm against ``gaps``).

    Side effects:
        Sets ``msa``, ``max_gap_seq``, ``max_gap_pos``, ``N``, ``L``,
        ``diversity`` and ``neff_entropy`` on the instance, and prints two
        summary lines. ``gapped_positions`` is only assigned here when column
        filtering actually runs.
    """
    size_template = "{0} is of length L={1} and there are {2} sequences in the alignment."
    stats_template = "Alignment has diversity [sqrt(N)/L]={0} and Neff(HHsuite-like)={1}."

    self.msa = io.read_msa(self.alignment_file, aln_format)

    # Row filter: a threshold of 100 (or more) means "keep every sequence".
    if max_gap_seq < 100:
        self.msa = gaps.remove_gapped_sequences(self.msa, max_gap_seq)
    self.max_gap_seq = max_gap_seq

    # Column filter: also remembers which positions were dropped.
    if max_gap_pos < 100:
        filtered = gaps.remove_gapped_positions(self.msa, max_gap_pos)
        self.msa, self.gapped_positions = filtered
    self.max_gap_pos = max_gap_pos

    # Dimensions of the (possibly filtered) alignment: rows, then columns.
    shape = self.msa.shape
    self.N = shape[0]
    self.L = shape[1]
    self.diversity = np.sqrt(self.N) / self.L
    self.neff_entropy = ccmpred.weighting.get_HHsuite_neff(self.msa)

    print(size_template.format(self.protein, self.L, self.N))

    rounded_diversity = np.round(self.diversity, decimals=3)
    rounded_neff = np.round(self.neff_entropy, decimals=3)
    print(stats_template.format(rounded_diversity, rounded_neff))
def _read_msa_or_exit(alignment_file, aln_format):
    """Read an alignment; on ``OSError`` print the problem and exit with status 1."""
    try:
        return io.read_msa(alignment_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(alignment_file, e))
        # A failed read is an error: exit non-zero so calling scripts and
        # pipelines can detect it (the status used to be 0, i.e. "success").
        sys.exit(1)


def plot_alignment_statistics(alignment_file, sample_aln_file, aln_format, max_gap_pos, plot_file):
    """Plot amino acid frequencies of an observed alignment against a sampled one.

    Both alignments are read, optionally reduced to the same set of columns,
    converted to single and pairwise frequencies, stripped of their gap
    frequencies, and handed to ``plot.plot_empirical_vs_model_statistics``.

    Args:
        alignment_file: Path of the observed alignment.
        sample_aln_file: Path of the sampled alignment.
        aln_format: Format name forwarded to ``io.read_msa`` for both files.
        max_gap_pos: Positions with more than this percentage of gaps are
            removed; values of 100 or more disable the filter.
        plot_file: Output target forwarded to the plotting routine.

    Raises:
        SystemExit: With status 1 if either alignment cannot be read
            (``OSError`` from ``io.read_msa``).
    """
    # read alignments
    alignment = _read_msa_or_exit(alignment_file, aln_format)
    sampled_alignment = _read_msa_or_exit(sample_aln_file, aln_format)

    # Remove positions with > MAX_GAP_POS % gaps
    if max_gap_pos < 100:
        alignment, gapped_positions = gaps.remove_gapped_positions(alignment, max_gap_pos)
        # The columns to drop are determined on the observed alignment and the
        # same columns are dropped from the sampled one. Build the lookup set
        # once instead of scanning gapped_positions for every column index.
        excluded = set(np.ravel(gapped_positions).tolist())
        non_gapped_positions = [
            i for i in range(sampled_alignment.shape[1]) if i not in excluded
        ]
        sampled_alignment = np.ascontiguousarray(sampled_alignment[:, non_gapped_positions])

    # compute sequence weights for observed sequences
    # NOTE(review): 0.8 is presumably a sequence identity cutoff -- confirm in ccmpred.weighting.
    weights = ccmpred.weighting.weights_simple(alignment, 0.8)

    # compute observed amino acid frequencies
    pseudocounts = PseudoCounts(alignment, weights)
    pseudocounts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )
    single_freq_observed, pairwise_freq_observed = pseudocounts.freqs

    # compute sequence weights for sampled sequences
    # (usually all sampled sequences obtain weight = 1)
    weights_sampled = ccmpred.weighting.weights_simple(sampled_alignment, 0.8)

    # compute sampled amino acid frequencies
    pseudocounts = PseudoCounts(sampled_alignment, weights_sampled)
    pseudocounts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )
    single_freq_sampled, pairwise_freq_sampled = pseudocounts.freqs

    # degap the frequencies (ignore gap frequencies); all four calls go through
    # the PseudoCounts instance built for the sampled alignment, as before.
    single_freq_observed = pseudocounts.degap(single_freq_observed, False)
    single_freq_sampled = pseudocounts.degap(single_freq_sampled, False)
    pairwise_freq_observed = pseudocounts.degap(pairwise_freq_observed, False)
    pairwise_freq_sampled = pseudocounts.degap(pairwise_freq_sampled, False)

    # plot
    plot.plot_empirical_vs_model_statistics(
        single_freq_observed,
        single_freq_sampled,
        pairwise_freq_observed,
        pairwise_freq_sampled,
        plot_file)