def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials): """ return a dictionary scoring_distribution that represents an un-normalized distribution generated by performing the following process num_trials times - Generate a random permutation rand_y of the sequence seq_y using random.shuffle(). - Compute the maximum value score for the local alignment of seq_x and rand_y using the score matrix scoring_matrix. - Increment the entry score in the dictionary scoring_distribution by one. :rtype : dict :param seq_x: first input sequence :param seq_y: second input sequence :param scoring_matrix: scoring matrix :param num_trials: number of trials :return: dictionary scoring_distribution """ scoring_distribution = {} for idx in range(num_trials): rand_y = list(seq_y) random.shuffle(rand_y) rand_y = "".join(rand_y) a_matrix = soln.compute_alignment_matrix(seq_x, rand_y, scoring_matrix, False) score, dummy1, dummy2 = soln.compute_local_alignment(seq_x, rand_y, scoring_matrix, a_matrix) if score in scoring_distribution: scoring_distribution[score] += 1 else: scoring_distribution[score] = 1 if (idx + 1) % (num_trials / 20) == 0: print "done: ", idx return scoring_distribution
def q2(): def q2_helper(str1, str2): if len(str1) != len(str2): return 0 else: ctr = 0 for char in range(len(str1)): if str1[char] == str2[char]: ctr += 1 return (ctr * 1.0 / len(str1)) * 100 human = data.read_protein(data.HUMAN_EYELESS_URL) fly = data.read_protein(data.FRUITFLY_EYELESS_URL) scores = data.read_scoring_matrix(data.PAM50_URL) c_pax = data.read_protein(data.CONSENSUS_PAX_URL) # get local alignment of human and fly a_matrix = soln.compute_alignment_matrix(human, fly, scores, False) l_score, l_h, l_ff = soln.compute_local_alignment(human, fly, scores, a_matrix) # removing the dashes l_h = l_h.replace("-", "") l_ff = l_ff.replace("-", "") # get global alignment matrix for each local string and pax pax_a_h_matrix = soln.compute_alignment_matrix(l_h, c_pax, scores, True) pax_a_ff_matrix = soln.compute_alignment_matrix(l_ff, c_pax, scores, True) # compute global alignment h_ga = soln.compute_global_alignment(l_h, c_pax, scores, pax_a_h_matrix) ff_ga = soln.compute_global_alignment(l_ff, c_pax, scores, pax_a_ff_matrix) print "human:\t\t", q2_helper(h_ga[1], h_ga[2]) print "fruit fly:\t", q2_helper(ff_ga[1], ff_ga[2])
def analyze_data():
    """
    Load the eyeless proteins and the PAM50 matrix and compute their local
    alignment.

    Nothing is returned; the results are stored in the module-level names
    human_protein, fruitfly_protein, scoring_matrix and alignment.
    """
    global human_protein, fruitfly_protein, scoring_matrix, alignment

    # inputs
    human_protein = read_protein(HUMAN_EYELESS_URL)
    fruitfly_protein = read_protein(FRUITFLY_EYELESS_URL)
    scoring_matrix = read_scoring_matrix(PAM50_URL)

    # The matrix is only an intermediate value, so it stays local.
    local_matrix = p.compute_alignment_matrix(
        human_protein, fruitfly_protein, scoring_matrix, False)
    alignment = p.compute_local_alignment(
        human_protein, fruitfly_protein, scoring_matrix, local_matrix)
def q1():
    """
    Print the local and the global alignment of the human and fruit fly
    eyeless proteins under the PAM50 scoring matrix.

    Each result is printed exactly as returned by soln, local alignment
    first and global alignment second.

    Reference values recorded from an earlier run: the local alignment
    scored 875 and the global alignment scored 4.
    """
    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)

    # local alignment (matrix built with the flag set to False)
    local_matrix = soln.compute_alignment_matrix(human, fly, scores, False)
    print(soln.compute_local_alignment(human, fly, scores, local_matrix))

    # global alignment (matrix built with the flag set to True)
    global_matrix = soln.compute_alignment_matrix(human, fly, scores, True)
    print(soln.compute_global_alignment(human, fly, scores, global_matrix))
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Return an un-normalized distribution of local alignment scores between
    seq_x and random permutations of seq_y.

    For each of the num_trials trials a copy of seq_y is shuffled with
    random.shuffle(), the local alignment of seq_x against the shuffled
    sequence is computed with scoring_matrix, and the count stored under the
    resulting score is increased by one.

    The returned dictionary maps each observed score to the number of trials
    that produced it.
    """
    scoring_distribution = {}

    for _trial in range(num_trials):
        # shuffle a copy so that seq_y itself is left untouched
        shuffled = list(seq_y)
        random.shuffle(shuffled)
        rand_y = ''.join(shuffled)

        matrix = p.compute_alignment_matrix(seq_x, rand_y, scoring_matrix, False)
        result = p.compute_local_alignment(seq_x, rand_y, scoring_matrix, matrix)
        score = result[0]

        if score in scoring_distribution:
            scoring_distribution[score] += 1
        else:
            scoring_distribution[score] = 1

    return scoring_distribution
def q5(): # calculate mean/standard deviation dist2 = {38: 3, 40: 7, 41: 13, 42: 23, 43: 27, 44: 29, 45: 45, 46: 74, 47: 69, 48: 72, 49: 71, 50: 65, 51: 52, 52: 64, 53: 57, 54: 49, 55: 43, 56: 28, 57: 40, 58: 31, 59: 17, 60: 18, 61: 14, 62: 12, 63: 9, 64: 9, 65: 4, 66: 10, 67: 8, 68: 5, 69: 6, 70: 2, 71: 3, 72: 4, 73: 1, 75: 3, 77: 4, 79: 2, 80: 1, 81: 1, 82: 2, 84: 1, 85: 1, 93: 1} avg = numpy.mean(dist2.keys()) std_d = numpy.std(dist2.keys()) print "mean:", avg print "standard deviation: ", std_d # get local scores human = data.read_protein(data.HUMAN_EYELESS_URL) fly = data.read_protein(data.FRUITFLY_EYELESS_URL) scores = data.read_scoring_matrix(data.PAM50_URL) # get local alignment of human and fly a_matrix = soln.compute_alignment_matrix(human, fly, scores, False) l_score, l_h, l_ff = soln.compute_local_alignment(human, fly, scores, a_matrix) print "local score: ", l_score print "z score: ", (l_score - avg) / std_d