def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """Tally local-alignment scores of seq_x against shuffled copies of seq_y.

    On every trial the characters of seq_y are randomly permuted, the
    permutation is aligned against seq_x with the ``student`` module's
    local-alignment routines, and the resulting score is counted.

    Args:
        seq_x: sequence that is kept fixed across all trials.
        seq_y: sequence whose characters are reshuffled on every trial.
        scoring_matrix: passed through unchanged to the ``student``
            alignment functions.
        num_trials: number of random permutations to score.

    Returns:
        dict mapping each observed score to the number of trials that
        produced it (un-normalized counts).
    """
    sdist = {}
    for _ in range(num_trials):
        # Start from a fresh copy of seq_y each trial, then permute it.
        shuffled = list(seq_y)
        random.shuffle(shuffled)
        rand_y = ''.join(shuffled)

        # The final False argument is what the local-alignment call sites in
        # this file pass -- presumably it selects the local-alignment matrix;
        # confirm against student.compute_alignment_matrix.
        alignment_matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        local_alignment = student.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)
        score = local_alignment[0]

        # dict.get instead of dict.has_key: has_key is deprecated in
        # Python 2 and no longer exists in Python 3.
        sdist[score] = sdist.get(score, 0) + 1
    return sdist
def part_one(): hseq = read_protein(HUMAN_EYELESS_URL) fseq = read_protein(FRUITFLY_EYELESS_URL) pm50 = read_scoring_matrix(PAM50_URL) am50 = student.compute_alignment_matrix(hseq, fseq, pm50, False) res = student.compute_local_alignment(hseq, fseq, pm50, am50 ) print_res(res,["Human :", "Fruitfly:"]) cpax = read_protein(CONSENSUS_PAX_URL) # print 'cpax:', cpax hdseq = res[1].replace('-', '') # print 'h no d:', hdseq amhc = student.compute_alignment_matrix(hdseq, cpax, pm50, True) gah = student.compute_global_alignment(hdseq, cpax, pm50, amhc) # gah = student.compute_global_alignment(cpax, hdseq, pm50, am50) print_res(gah,["Human:", "CPAX :"]) dashh = gah[1].count('-') dashc = gah[2].count('-') print "dash h:", dashh, 'dashc:', dashc, 'lenh:', len(gah[1]), 'lenc:', len(gah[2]) print 'Percentage Agree Human:', 100 * (((len(gah[1]) - (dashh + dashc)) * 1.0)/len(gah[1])),"%" fdseq = res[2].replace('-', '') amfc = student.compute_alignment_matrix(fdseq, cpax, pm50, True) gaf = student.compute_global_alignment(fdseq, cpax, pm50, amfc) print_res(gaf,["Fruitfly:", "CPAX :"]) dashh = gaf[1].count('-') dashc = gaf[2].count('-') print "dash h:", dashh, 'dashc:', dashc, 'lenh:', len(gah[1]), 'lenc:', len(gah[2]) print 'Percentage Agree Fruitfly:', 100 * (((len(gah[1]) - (dashh + dashc)) * 1.0)/len(gah[1])),"%" num_trials = 100 dist = generate_null_distribution(hseq, fseq, pm50, num_trials) disthf = {} for i in dist.keys(): disthf[i] = (dist[i] * 1.0) / num_trials print 'Unnormalized Dist:', dist print 'Normalized Dist:', disthf sum = 0 for i in dist.keys(): sum = dist[i] + sum mean = (sum * 1.0) / len(dist) sd = 0.0 for i in dist.keys(): sd = math.pow(dist[i] - mean, 2) + sd sdv = math.sqrt((sd * 1.0)/len(dist)) print "s:", res[0] print "Mean:", mean print "Standard Deviation:", sdv print "z-score:", (res[0] - mean)/sdv plt.bar(disthf.keys(), disthf.values(), label='statistical hypothesis') plt.xlabel('Scores') plt.ylabel('Fraction of trails') plt.title('Normalized 
distribution of generate_null_distribution') plt.grid(True) # plt.legend(loc='upper right') plt.show()