Beispiel #1
0
def goodness_of_fit(ddGs):
    """
    Run a Shapiro-Wilk normality test on every row of ``ddGs``.

    Each row is first passed through ``FitGaussianMixture.preprocess``
    (defined elsewhere in the project) and the result is handed to
    ``stats.shapiro``.

    Parameters:
    ddGs (array-like): Tabulation of all 19 ddGs for all residues of a
                        given protein; one row per residue.

    Returns:
    ks (list of float): Test statistic of the SW test, one per row
    pvalues (list of float): P-value of the SW test, one per row
    """
    preprocessor = FitGaussianMixture(ddGs)

    # One (statistic, p-value) pair per row, in input order.
    outcomes = [
        stats.shapiro(preprocessor.preprocess(row_ddGs)) for row_ddGs in ddGs
    ]

    ks = [statistic for statistic, _ in outcomes]
    pvalues = [p_value for _, p_value in outcomes]
    return ks, pvalues
def plotout(protein_name, rejected_contacts, all_contacts):
    """
    Draw side-by-side box plots of two contact samples and show the figure.

    The legend title carries the statistic and p-value returned by
    ``stats.ks_2samp(all_contacts, rejected_contacts)``.

    Parameters:
    protein_name (str): Used as the figure title
    rejected_contacts (array-like): Sample drawn at x position 1
                                    ('reject the null')
    all_contacts (array-like): Sample drawn at x position 2 ('all')

    Returns:
    None; the figure is displayed with ``plt.show()``.
    """
    plt.title(protein_name)

    samples = [rejected_contacts, all_contacts]
    plt.boxplot(samples, showmeans=True)

    # Axis decoration: one tick per box, in the same order as `samples`.
    tick_positions = [1, 2]
    tick_labels = ['reject the null', 'all']
    plt.xlabel('residues')
    plt.xticks(tick_positions, tick_labels)
    plt.ylabel('frequency of residues')

    # Original author's note: only CAII significant.
    statistic, p_value = stats.ks_2samp(all_contacts, rejected_contacts)

    # No labelled artists exist, so the legend only displays its title.
    legend_title = 'KS = {0:.2f} ({1:.2E})'.format(statistic, p_value)
    plt.legend(title=legend_title)
    plt.show()


# Script entry point: run a Shapiro test (stats.shapiro) on each row of the
# experimental ddG table and collect the statistics, p-values and the
# residue numbers of the rows that were actually tested.
if __name__ == "__main__":
    ddGs_exp = read_in_experiment()  # rows are ordered by residue number minus one
    fit_gaussian_mixture = FitGaussianMixture(ddGs_exp)

    ks = []  # test statistic for each tested row
    pvalues = []  # p-value for each tested row
    residues_ref = []  # 1-based residue number for each tested row
    for res, ddGs_res in enumerate(ddGs_exp):
        ddGs_19 = fit_gaussian_mixture.preprocess(ddGs_res)
        # Only rows whose preprocessed length is exactly 17 are tested; all
        # other rows are skipped.
        # NOTE(review): the variable is named ddGs_19 but the filter requires
        # 17 values -- confirm which count is intended.
        if len(ddGs_19) == 17:
            k, pvalue = stats.shapiro(ddGs_19)
            ks.append(k)
            pvalues.append(pvalue)
            residues_ref.append(res + 1)  # convert 0-based row index to residue number

    # Positions within `pvalues` (and therefore within `residues_ref`) whose
    # p-value is below 0.05. These are list positions, not residue numbers;
    # map them through `residues_ref` to obtain residue numbers.
    residues_fail = np.where(np.array(pvalues) < 0.05)[0]

    # get_contacts is defined elsewhere; '1pga' is presumably a PDB
    # identifier -- TODO confirm.
    d_contacts = get_contacts('1pga')
Beispiel #3
0
                linestyle='--',
                color='k')

    plt.xticks(x, pretty_proteins)  # rotation = 20)
    plt.ylabel('K-S test statistic')
    plt.legend()
    plt.show()


# Script entry point: for every name in `proteins` (defined elsewhere),
# compute two K-S statistics and pass them to plotout().
if __name__ == '__main__':
    data_file = 'Tokuriki_2007.xlsx'
    df = pd.read_excel(data_file)

    ks = []  # K-S statistic from stats.ks_2samp, one per protein
    ks2 = []  # first element returned by run_gaussian_mixture, one per protein
    for protein_name in proteins:
        # Rows of the spreadsheet whose 'Name' column equals this protein.
        protein = df[df['Name'] == protein_name]

        ddGs = get_protein_ddG(protein)
        fit_gaussian_mixture = FitGaussianMixture(ddGs)
        # The return value is not used below. The call is kept because it may
        # set state on fit_gaussian_mixture -- TODO confirm before removing.
        ddGs_overall = fit_gaussian_mixture.preprocess(ddGs)  # original note: destroys shape

        yPred = generate_dist(ddGs, fit_gaussian_mixture)

        # Two-sample K-S test between fit_gaussian_mixture.hist and yPred;
        # only the statistic is kept, the p-value is discarded.
        k, pvalue = stats.ks_2samp(fit_gaussian_mixture.hist, yPred)

        ks.append(k)
        ks2.append(run_gaussian_mixture(fit_gaussian_mixture)[0])

    # x positions 0..len(proteins)-1, followed by the two statistic lists.
    plotout(np.arange(len(proteins)), ks, ks2)
Beispiel #4
0
	del_list = []
	for res, ddG_res in enumerate(ddGs):
		ddGs_17 = fit_gaussian_mixture.preprocess(ddG_res)
		if len(ddGs_17)!=17:
			del_list.append(res)
 	return del_list

def plotout(ddGs_exp, fit_gaussian_mixture):
    """
    Plot the ddG histogram with two fitted curves overlaid and show it.

    Besides its parameters, this function reads the module-level names
    ``yPred``, ``gaussian``, ``k``, ``ks2`` and ``KS_pvalue``; they must be
    defined before it is called.

    Parameters:
    ddGs_exp (numpy array): ddG values; NaN entries are dropped before the
                            histogram is drawn
    fit_gaussian_mixture: object exposing ``bins`` (passed to ``plt.hist``)
                          and ``x`` (x coordinates of both curves)

    Returns:
    None; the figure is displayed with ``plt.show()``.
    """
    plt.title('$\\beta$1 domain of protein G')

    # Boolean-mask indexing removes NaNs and flattens the selection.
    finite_ddGs = ddGs_exp[~np.isnan(ddGs_exp)]
    plt.hist(finite_ddGs, fit_gaussian_mixture.bins, density=True)

    x = fit_gaussian_mixture.x
    # KS_pvalue receives len(x) for both of its size arguments.
    n_points = len(x)

    plt.plot(
        x,
        yPred,
        color='k',
        label='N-Gaussian\nD = {0:.2f}\np-value = {1:.2E}'.format(
            k, KS_pvalue(k, n_points, n_points)),
    )
    plt.plot(
        x,
        gaussian,
        color='r',
        label='bi-Gaussian\nD = {0:.2f}\np-value = {1:.2E}'.format(
            ks2, KS_pvalue(ks2, n_points, n_points)),
    )

    # Raw string: "\D" is not a valid escape sequence in a normal literal and
    # triggers a DeprecationWarning/SyntaxWarning. The text is unchanged.
    plt.xlabel(r'$\Delta \Delta G$ (kcal/mol)')
    plt.ylabel('probability')
    plt.legend()
    plt.show()

# Script entry point. The guard compares __name__ with '__main__'; a bare
# non-empty string literal is always true and would run this code on import.
if __name__ == '__main__':
    ddGs_exp = read_in_experiment()

    # NOTE(review): parse() appears to rely on the module-level
    # fit_gaussian_mixture, so this assignment must stay before the call.
    fit_gaussian_mixture = FitGaussianMixture(ddGs_exp)

    # Remove the rows whose indices parse() reports.
    del_list = parse(ddGs_exp)
    ddGs_exp = np.delete(ddGs_exp, del_list, axis=0)

    # yPred, k, ks2 and gaussian are read as globals by plotout(); keep these
    # names unchanged.
    yPred = generate_dist(ddGs_exp, fit_gaussian_mixture)

    k, pvalue = stats.ks_2samp(fit_gaussian_mixture.hist, yPred)
    ks2, pvalue2, gaussian = run_gaussian_mixture(fit_gaussian_mixture)

    plotout(ddGs_exp, fit_gaussian_mixture)