def test_compute_alignment_matrix(self):
    """
    Check compute_alignment_matrix in global and local mode.

    Both cases align a short sequence against 'cab' with a scoring matrix
    over {'a', 'b', 'c'} built with diagonal score 10, off-diagonal score 5
    and dash score -1.
    """
    target = 'cab'

    # Case 1: the single character 'a'.
    scores_single = project4.build_scoring_matrix(set('abc'), 10, 5, -1)
    global_single = project4.compute_alignment_matrix(
        'a', target, scores_single, global_flag=True)
    local_single = project4.compute_alignment_matrix(
        'a', target, scores_single, global_flag=False)
    self.assertEqual(global_single, [[0, -1, -2, -3],
                                     [-1, 5, 9, 8]])
    self.assertEqual(local_single, [[0, 0, 0, 0],
                                    [0, 5, 10, 9]])

    # Case 2: the two-character sequence 'cc'.
    scores_double = project4.build_scoring_matrix(set('abc'), 10, 5, -1)
    global_double = project4.compute_alignment_matrix(
        'cc', target, scores_double, global_flag=True)
    local_double = project4.compute_alignment_matrix(
        'cc', target, scores_double, global_flag=False)
    self.assertEqual(global_double, [[0, -1, -2, -3],
                                     [-1, 10, 9, 8],
                                     [-2, 9, 15, 14]])
    self.assertEqual(local_double, [[0, 0, 0, 0],
                                    [0, 10, 9, 8],
                                    [0, 10, 15, 14]])
def question2(): """ Compute the global alignments of local human vs concensus PAX domain as well as local fruitfly vs. consensus PAX domain. Return as percentages. """ # Delete any dashes present in local alignments of humans and fruitflies. q1 = question1() dashless_local_human = q1[1].replace('-', '') dashless_local_fly = q1[2].replace('-', '') # Compute global alignments. human_alignment_matrix = project4.compute_alignment_matrix( dashless_local_human, PAX, SCORING_MATRIX, False) fly_alignment_matrix = project4.compute_alignment_matrix( dashless_local_fly, PAX, SCORING_MATRIX, False) human_global = project4.compute_global_alignment(dashless_local_human, PAX, SCORING_MATRIX, human_alignment_matrix) fly_global = project4.compute_global_alignment(dashless_local_fly, PAX, SCORING_MATRIX, fly_alignment_matrix) # Compute percentage of elements in human_global and fly_global that agree with pax human_percent = 0.0 fly_percent = 0.0 for char in range(len(human_global[1])): if human_global[1][char] == human_global[2][char]: human_percent += 1 for char in range(len(fly_global[1])): if fly_global[1][char] == fly_global[2][char]: fly_percent += 1 human_percent = human_percent / len(human_global[1]) fly_percent = fly_percent / len(fly_global[1]) print "human_percent:", human_percent print "fly_percent:", fly_percent
def question2():
    """
    Code for question 2

    Strips the dashes from the two local alignments returned by
    question1(), globally aligns each with the consensus PAX sequence and
    prints the percentage of aligned positions that are identical -- first
    for the human sequence, then for the fruit fly sequence.
    """
    local_result = question1()
    pam50 = read_scoring_matrix(PAM50_URL)
    human_seq = local_result[1].replace('-', '')
    fly_seq = local_result[2].replace('-', '')
    pax = read_protein(CONSENSUS_PAX_URL)

    human_matrix = compute_alignment_matrix(human_seq, pax, pam50, True)
    fly_matrix = compute_alignment_matrix(fly_seq, pax, pam50, True)

    for sequence, matrix in ((human_seq, human_matrix),
                             (fly_seq, fly_matrix)):
        aligned = compute_global_alignment(sequence, pax, pam50, matrix)
        aligned_seq, aligned_pax = aligned[1], aligned[2]
        matches = sum(1 for pos in range(len(aligned_seq))
                      if aligned_seq[pos] == aligned_pax[pos])
        print(float(matches) / len(aligned_seq) * 100)
def question2():
    """
    QUESTION 2: compare both local alignments with the consensus PAX domain.

    Reads the names ``local_human``, ``local_fruitfly`` and ``scores`` from
    the surrounding scope, removes the dashes from both local alignments
    and globally aligns each with the consensus sequence read from
    'alg_ConsensusPAXDomain.txt'.  For the human sequence first and the
    fruit fly sequence second, it prints the global alignment followed by
    the percentage of aligned positions that are identical.
    """
    # delete the dashes in local alignments
    local_human_new = local_human.replace('-', '')
    local_fruitfly_new = local_fruitfly.replace('-', '')

    # The context manager closes the file even if reading fails.
    with open('alg_ConsensusPAXDomain.txt', 'r') as consensus_file:
        consensus = consensus_file.read()
    # NOTE(review): drops the last two characters, presumably a trailing
    # line ending -- confirm against the data file.
    consensus = consensus[:-2]

    # compute the global alignment, human first and fruit fly second
    for dashless in (local_human_new, local_fruitfly_new):
        align_matrix = project4.compute_alignment_matrix(
            dashless, consensus, scores, True)
        global_align = project4.compute_global_alignment(
            dashless, consensus, scores, align_matrix)
        print(global_align)

        aligned_seq = global_align[1]
        aligned_consensus = global_align[2]
        similarity = sum(1 for seq_char, cons_char
                         in zip(aligned_seq, aligned_consensus)
                         if seq_char == cons_char)
        print(similarity / float(len(aligned_seq)) * 100)
def question2():
    """
    QUESTION 2: compare both local alignments with the consensus PAX domain.

    Reads the names ``local_human``, ``local_fruitfly`` and ``scores`` from
    the surrounding scope, removes the dashes from both local alignments
    and globally aligns each with the consensus sequence read from
    'alg_ConsensusPAXDomain.txt'.  For the human sequence first and the
    fruit fly sequence second, it prints the global alignment followed by
    the percentage of aligned positions that are identical.
    """
    # delete the dashes in local alignments
    local_human_new = local_human.replace('-', '')
    local_fruitfly_new = local_fruitfly.replace('-', '')

    # The context manager closes the file even if reading fails.
    with open('alg_ConsensusPAXDomain.txt', 'r') as consensus_file:
        consensus = consensus_file.read()
    # NOTE(review): drops the last two characters, presumably a trailing
    # line ending -- confirm against the data file.
    consensus = consensus[:-2]

    # compute the global alignment, human first and fruit fly second
    for dashless in (local_human_new, local_fruitfly_new):
        align_matrix = project4.compute_alignment_matrix(
            dashless, consensus, scores, True)
        global_align = project4.compute_global_alignment(
            dashless, consensus, scores, align_matrix)
        print(global_align)

        aligned_seq = global_align[1]
        aligned_consensus = global_align[2]
        similarity = sum(1 for seq_char, cons_char
                         in zip(aligned_seq, aligned_consensus)
                         if seq_char == cons_char)
        print(similarity / float(len(aligned_seq)) * 100)
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Score the local alignment of seq_x against random shuffles of seq_y.

    Inputs:
        seq_x, seq_y: character strings that share a common alphabet with
            scoring_matrix.
        scoring_matrix: dictionary of dictionaries; the entry
            [seq_x[i]][seq_y[j]] is the score for aligning seq_x[i] with
            seq_y[j].
        num_trials: integer number of simulations to run.

    Output:
        A list holding one local alignment score per trial, in trial order.
    """
    def shuffled_copy():
        # Permute a copy so the caller's seq_y is left untouched.
        letters = list(seq_y)
        random.shuffle(letters)
        return ''.join(letters)

    trial_scores = []
    for _ in range(num_trials):
        rand_y = shuffled_copy()
        matrix = seq.compute_alignment_matrix(seq_x, rand_y,
                                              scoring_matrix, False)
        local = seq.compute_local_alignment(seq_x, rand_y,
                                            scoring_matrix, matrix)
        trial_scores.append(local[0])
    return trial_scores
def find_scoring_matrix(x, y, med, dim):
    """
    Search for scoring parameters that reproduce minimum edit distance.

    The target is the relation med = |x| + |y| - score(x, y).

    Inputs:
        x, y: english strings
        med: minimum edit distance between x, y
        dim: number of values tried for each of diag_score, off_score and
            dash_score; diag and off run over 0..dim-1, dash over 0..-(dim-1)

    Output:
        Array with one row (diag_score, off_score, dash_score) for every
        combination whose global alignment score equals |x| + |y| - med.
    """
    letters = set('abcdefghijklmnopqrstuvwxyz')

    # e.g. med(kitten, sitting) = 3 gives a target score of 10
    target_score = len(x) + len(y) - med

    scores = np.zeros((dim, dim, dim))
    for diag_score in range(dim):
        for off_score in range(dim):
            for dash_size in range(dim):
                scoring = seq.build_scoring_matrix(
                    letters, diag_score, off_score, -1 * dash_size)
                alignment = seq.compute_alignment_matrix(x, y, scoring)
                result = seq.compute_global_alignment(x, y, scoring,
                                                      alignment)
                scores[diag_score, off_score, dash_size] = result[0]

    hits = np.transpose(np.nonzero(scores == target_score))
    # The third index stored the magnitude; report the actual dash score.
    hits[:, 2] *= -1
    return hits
def question_1():
    '''
    Locally align the human and fruit fly eyeless proteins with PAM50.

    Loads both protein sequences and the PAM50 scoring matrix through the
    ``provided`` module, builds the local alignment matrix (human sequence
    first, fruit fly sequence second) and returns whatever
    project4.compute_local_alignment produces for them.
    '''
    human = provided.read_protein(provided.HUMAN_EYELESS_URL)
    fruitfly = provided.read_protein(provided.FRUITFLY_EYELESS_URL)
    pam50 = provided.read_scoring_matrix(provided.PAM50_URL)

    local_matrix = project4.compute_alignment_matrix(
        human, fruitfly, pam50, False)
    return project4.compute_local_alignment(
        human, fruitfly, pam50, local_matrix)
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    '''
    Helper function for Question 4

    Runs num_trials trials.  Each trial shuffles a copy of seq_y with
    random.shuffle(), computes the local alignment of seq_x with the
    shuffled sequence using scoring_matrix and counts the resulting score.
    The number of completed trials is printed after every trial.

    Returns the dictionary scoring_distribution mapping each observed score
    to the number of trials that produced it (un-normalized).
    '''
    scoring_distribution = {}
    for completed in range(1, num_trials + 1):
        letters = list(seq_y)
        random.shuffle(letters)
        rand_y = ''.join(letters)

        matrix = project4.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        best = project4.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, matrix)[0]

        scoring_distribution[best] = scoring_distribution.get(best, 0) + 1
        print(completed)
    return scoring_distribution
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Function for question 4

    Runs num_trials trials.  Each trial reshuffles the characters of seq_y,
    computes the local alignment of seq_x with the shuffled sequence using
    scoring_matrix and counts the resulting score.  The zero-based trial
    index is printed after every trial.

    Returns a dictionary mapping each observed score to the number of
    trials that produced it.
    """
    # Work on a list copy; random.shuffle needs a mutable sequence.
    list_seq_y = list(seq_y)
    scoring_distribution = {}
    for dummy_idx in range(num_trials):
        random.shuffle(list_seq_y)
        new_seq_y = ''.join(list_seq_y)
        # Score with the matrix supplied by the caller.
        align_matrix = project4.compute_alignment_matrix(
            seq_x, new_seq_y, scoring_matrix, False)
        local_result = project4.compute_local_alignment(
            seq_x, new_seq_y, scoring_matrix, align_matrix)
        score = local_result[0]
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
        print(dummy_idx)
    return scoring_distribution
def generate_null_distribution2(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Return the null distribution of local alignment scores, cached on disk.

    If 'distr.json' can be read, the scores stored there are returned
    without running any trial.  Otherwise num_trials trials are run (each
    one shuffles seq_y and locally aligns it with seq_x) and the result is
    written to 'distr.json' before being returned.

    Returns a pair (distr, raw):
        distr: dict {score: count} over all trials
        raw: list of the individual scores, duplicates included

    NOTE(review): the cache is not keyed on the arguments, so a stale
    'distr.json' is returned even when the inputs differ.
    """
    try:
        with open('distr.json') as f:
            raw = loads(f.read())['raw']
        # JSON object keys are always strings, so the stored 'distr' would
        # come back keyed by str.  Recount from the raw scores to return
        # the same key type as a fresh computation.
        distr = {}
        for score in raw:
            distr[score] = distr.get(score, 0) + 1
        return distr, raw
    except (IOError, OSError, ValueError, KeyError, TypeError) as e:
        # Missing, unreadable or malformed cache: compute from scratch.
        print('can\'t open file', str(e))

    distr = {}
    raw = []
    for _ in range(num_trials):
        temp = list(seq_y)
        shuffle(temp)
        rand_y = ''.join(temp)
        align_matrix = compute_alignment_matrix(seq_x, rand_y,
                                                scoring_matrix, False)
        score = compute_local_alignment(seq_x, rand_y, scoring_matrix,
                                        align_matrix)[0]
        distr[score] = distr.get(score, 0) + 1
        raw.append(score)

    with open('distr.json', 'w') as f:
        f.write(dumps({'distr': distr, 'raw': raw}))
    return distr, raw
def edit_dist(xs, ys):
    """
    Return |xs| + |ys| - score, where score is the global alignment score
    of xs and ys under a scoring matrix built with diagonal score 2,
    off-diagonal score 1 and dash score 0.
    """
    # ascii_lowercase is presumably the constant from the string module,
    # i.e. the 26 lowercase letters.
    letters = ascii_lowercase
    scoring_matrix = build_scoring_matrix(letters, 2, 1, 0)
    # True selects the global alignment matrix.
    global_matrix = compute_alignment_matrix(xs, ys, scoring_matrix, True)
    best_score, _, _ = compute_global_alignment(xs, ys, scoring_matrix,
                                                global_matrix)
    return len(xs) + len(ys) - best_score
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Function for question 4

    Runs num_trials trials.  Each trial reshuffles the characters of seq_y,
    computes the local alignment of seq_x with the shuffled sequence using
    scoring_matrix and counts the resulting score.  The zero-based trial
    index is printed after every trial.

    Returns a dictionary mapping each observed score to the number of
    trials that produced it.
    """
    # Work on a list copy; random.shuffle needs a mutable sequence.
    list_seq_y = list(seq_y)
    scoring_distribution = {}
    for dummy_idx in range(num_trials):
        random.shuffle(list_seq_y)
        new_seq_y = ''.join(list_seq_y)
        # Score with the matrix supplied by the caller.
        align_matrix = project4.compute_alignment_matrix(
            seq_x, new_seq_y, scoring_matrix, False)
        local_result = project4.compute_local_alignment(
            seq_x, new_seq_y, scoring_matrix, align_matrix)
        score = local_result[0]
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
        print(dummy_idx)
    return scoring_distribution
def question_2(): human = read_protein(HUMAN_EYELESS_URL) fly = read_protein(FRUITFLY_EYELESS_URL) consensus = read_protein(CONSENSUS_PAX_URL) scoring_matrix = read_scoring_matrix(PAM50_URL) alignment_matrix_local = project4.compute_alignment_matrix(human, fly, scoring_matrix, False) local_aligns = project4.compute_local_alignment(human, fly, scoring_matrix, alignment_matrix_local) human_local_align = local_aligns[1] fly_local_align = local_aligns[2] human_no_dashes = human_local_align.replace('-','') fly_no_dashes = fly_local_align.replace('-','') global_matrix_human_consensus = project4.compute_alignment_matrix(human_no_dashes, consensus, scoring_matrix,True) global_matrix_fly_consensus = project4.compute_alignment_matrix(fly_no_dashes,consensus, scoring_matrix, True) global_align_human_consensus = project4.compute_global_alignment(human_no_dashes,consensus,scoring_matrix,global_matrix_human_consensus) align_global_human = global_align_human_consensus[1] global_align_fly_consensus = project4.compute_global_alignment(fly_no_dashes, consensus,scoring_matrix,global_matrix_fly_consensus) align_global_fly = global_align_fly_consensus[1] count_human = 0 count_fly = 0 #print align_global_human #print align_global_fly #print consensus for pair in zip(align_global_human, consensus): if pair[0] == pair[1]: count_human += 1. for pair in zip(align_global_fly,consensus): if pair[0] == pair[1]: count_fly += 1. human_percentage = (count_human / len(align_global_human)) * 100 fly_percentage = (count_fly / len(align_global_fly)) * 100 print "human percentage: ", human_percentage print "fly percentage: ", fly_percentage
def edit_dist(xs, ys):
    '''
    Helper function for Question 8

    Returns |xs| + |ys| - score, where score is the global alignment score
    of xs and ys under a lowercase-letter scoring matrix built with
    diagonal score 2, off-diagonal score 1 and dash score 0.
    '''
    letters = 'abcdefghijklmnopqrstuvwxyz'
    scoring_matrix = project4.build_scoring_matrix(letters, 2, 1, 0)
    global_matrix = project4.compute_alignment_matrix(
        xs, ys, scoring_matrix, True)
    # Only the score is needed; the two aligned strings are ignored.
    best_score, _aligned_x, _aligned_y = project4.compute_global_alignment(
        xs, ys, scoring_matrix, global_matrix)
    return len(xs) + len(ys) - best_score
def question1():
    """
    Code for question 1

    Loads the human and fruit fly eyeless proteins and the PAM50 scoring
    matrix, then returns the result of compute_local_alignment for the
    pair (human sequence first, fruit fly sequence second).
    """
    human_seq = read_protein(HUMAN_EYELESS_URL)
    fly_seq = read_protein(FRUITFLY_EYELESS_URL)
    pam50 = read_scoring_matrix(PAM50_URL)
    local_matrix = compute_alignment_matrix(human_seq, fly_seq, pam50, False)
    return compute_local_alignment(human_seq, fly_seq, pam50, local_matrix)
def question1():
    """
    QUESTION 1: local alignment of the fruit fly and human eyeless proteins.

    Reads ``fruitfly_protein``, ``human_protein`` and ``scores`` from the
    surrounding scope and computes their local alignment.

    Returns the tuple (score, local_human, local_fruitfly) so that callers
    can use the aligned sequences.
    """
    align_matrix = project4.compute_alignment_matrix(
        fruitfly_protein, human_protein, scores, False)
    local_alignment_eyeless = project4.compute_local_alignment(
        fruitfly_protein, human_protein, scores, align_matrix)

    # The fruit fly sequence is passed first, so its aligned string is
    # element 1 and the human one is element 2.
    score = local_alignment_eyeless[0]
    local_fruitfly = local_alignment_eyeless[1]
    local_human = local_alignment_eyeless[2]
    return score, local_human, local_fruitfly
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Build the un-normalized distribution of local alignment scores of
    seq_x against random shuffles of seq_y.

    Runs num_trials trials while updating a progress bar, passes the
    resulting {score: count} dictionary to save_dict and returns it.
    """
    distribution = {}
    # Size the bar from num_trials so update() never exceeds max_value.
    bar = progressbar.ProgressBar(max_value=num_trials)
    for progress in range(num_trials):
        bar.update(progress)
        # rand_y stays a list of characters, as passed on before.
        rand_y = list(seq_y)
        random.shuffle(rand_y)
        alignment_matrix = project4.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        score = project4.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)[0]
        distribution[score] = distribution.get(score, 0) + 1
    # NOTE(review): saved once, after all trials -- confirm this is the
    # intended point to persist the result.
    save_dict(distribution)
    return distribution
def question_2():
    '''
    Compare each sequence of the Question 1 local alignment with the
    consensus PAX domain.

    For the human and the fruit fly alignment in turn: delete the dashes,
    globally align the dash-less sequence with the ConsensusPAXDomain
    sequence under PAM50, and pass the two globally-aligned strings to
    compute_similarity.

    Returns the tuple ('Human:', human_similarity,
                       'Fruitfly:', fruitfly_similarity).
    '''
    consensus = provided.read_protein(provided.CONSENSUS_PAX_URL)
    _score, human_alignment, fruitfly_alignment = question_1()
    pam50 = provided.read_scoring_matrix(provided.PAM50_URL)

    def align_with_consensus(local_alignment):
        # Global alignment of the dash-less sequence against the consensus.
        dashless = local_alignment.replace('-', '')
        matrix = project4.compute_alignment_matrix(
            dashless, consensus, pam50, True)
        return project4.compute_global_alignment(
            dashless, consensus, pam50, matrix)

    human_global = align_with_consensus(human_alignment)
    fruitfly_global = align_with_consensus(fruitfly_alignment)

    human_similarity = compute_similarity(human_global[1], human_global[2])
    fruitfly_similarity = compute_similarity(fruitfly_global[1],
                                             fruitfly_global[2])
    return 'Human:', human_similarity, 'Fruitfly:', fruitfly_similarity
def question1And2():
    """
    Print the answers to Question 1 and Question 2.

    Question 1: local alignment of the human and fruit fly eyeless
    proteins under PAM50 (score and both aligned sequences).
    Question 2: agreement of each dash-less local sequence with the
    consensus PAX domain, as reported by ``agreement``.
    """
    human_protein = read_protein(HUMAN_EYELESS_URL)
    fly_protein = read_protein(FRUITFLY_EYELESS_URL)
    print(len(human_protein), len(fly_protein))

    pam50 = read_scoring_matrix(PAM50_URL)
    local_matrix = compute_alignment_matrix(human_protein, fly_protein,
                                            pam50, False)
    score, human_local, fly_local = compute_local_alignment(
        human_protein, fly_protein, pam50, local_matrix)

    print('Question 1')
    print('The score of the local alignment is: ', score)
    print('The sequence for the HumanEyelessProtein is: ', human_local)
    print('The sequence for the FruitflyEyelessProtein is: ', fly_local)
    print()

    print('Question2')
    pax = read_protein(CONSENSUS_PAX_URL)

    # Step 1: delete any dashes '-' present in the sequences.
    human_nodash = ''.join(char for char in human_local if char != '-')
    fly_nodash = ''.join(char for char in fly_local if char != '-')

    # Step 2: build the global alignment matrices against the consensus.
    human_pax_matrix = compute_alignment_matrix(human_nodash, pax,
                                                pam50, True)
    fly_pax_matrix = compute_alignment_matrix(fly_nodash, pax, pam50, True)

    # Step 3: agreement() is given each matrix and produces the value
    # printed below as the percentage of agreeing elements.
    human_agree = agreement(human_nodash, pax, pam50, human_pax_matrix)
    fly_agree = agreement(fly_nodash, pax, pam50, fly_pax_matrix)
    print('Human vs Consensus agree = %s%%' % human_agree)
    print('Fly vs Consensus agree = %s%%' % fly_agree)
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    '''
    Run a single random trial and return its local alignment score.

    1) Generate a random permutation 'rand_y' of the sequence seq_y
    2) Compute the local alignment of seq_x and rand_y using
       'scoring_matrix' and return its score (element 0 of the result)

    num_trials is accepted but not used by this function.
    '''
    letters = list(seq_y)
    random.shuffle(letters)
    rand_y = ''.join(letters)
    matrix = project4.compute_alignment_matrix(
        seq_x, rand_y, scoring_matrix, False)
    return project4.compute_local_alignment(
        seq_x, rand_y, scoring_matrix, matrix)[0]
def align_eyeless(scoring_matrix):
    """
    Locally align the human and drosophila eyeless amino acid sequences
    with the given scoring matrix and return the result of
    seq.compute_local_alignment (human sequence first).
    """
    # load both eyeless AA strings
    human_aa = read_protein(HUMAN_EYELESS_URL)
    fly_aa = read_protein(FRUITFLY_EYELESS_URL)

    # False selects the local alignment matrix
    local_matrix = seq.compute_alignment_matrix(human_aa, fly_aa,
                                                scoring_matrix, False)
    local_alignment = seq.compute_local_alignment(human_aa, fly_aa,
                                                  scoring_matrix,
                                                  local_matrix)
    return local_alignment
def question1():
    """
    QUESTION 1: local alignment of the fruit fly and human eyeless proteins.

    Reads ``fruitfly_protein``, ``human_protein`` and ``scores`` from the
    surrounding scope and computes their local alignment.

    Returns the tuple (score, local_human, local_fruitfly) so that callers
    can use the aligned sequences.
    """
    align_matrix = project4.compute_alignment_matrix(
        fruitfly_protein, human_protein, scores, False)
    local_alignment_eyeless = project4.compute_local_alignment(
        fruitfly_protein, human_protein, scores, align_matrix)

    # The fruit fly sequence is passed first, so its aligned string is
    # element 1 and the human one is element 2.
    score = local_alignment_eyeless[0]
    local_fruitfly = local_alignment_eyeless[1]
    local_human = local_alignment_eyeless[2]
    return score, local_human, local_fruitfly
def check_spelling(checked_word, dist, word_list):
    """
    Return the set of words in word_list whose edit distance to
    checked_word is at most dist.

    edit distance = |x| + |y| - score(x, y), with the score taken from a
    global alignment using diag_score = 2, off_diag_score = 1 and
    dash_score = 0.
    """
    letters = set("abcdefghijklmnopqrstuvwxyz")
    scoring = project4.build_scoring_matrix(letters, 2, 1, 0)
    checked_len = len(checked_word)

    close_words = set([])
    for candidate in word_list:
        matrix = project4.compute_alignment_matrix(
            checked_word, candidate, scoring, True)
        best_score, _, _ = project4.compute_global_alignment(
            checked_word, candidate, scoring, matrix)
        if checked_len + len(candidate) - best_score <= dist:
            close_words.add(candidate)
    return close_words
def calculate_similar_ratio():
    """
    Print how closely each local eyeless alignment matches the consensus
    PAX domain.

    Takes elements 1 and 2 of align_human_fly_protein() as the human and
    fly sequences, removes their dashes and globally aligns each with the
    consensus using the ``scoring_matrix`` from the surrounding scope.
    Prints, for the human sequence first and the fly sequence second, the
    fraction of aligned positions that are identical.
    """
    result = align_human_fly_protein()
    sequence_human = result[1].replace('-', '')
    sequence_fly = result[2].replace('-', '')

    # Fetch the consensus once and reuse it for both comparisons.
    protein_consensus = provided.read_protein(provided.CONSENSUS_PAX_URL)

    for sequence in (sequence_human, sequence_fly):
        alignment_matrix = project4.compute_alignment_matrix(
            sequence, protein_consensus, scoring_matrix, True)
        aligned = project4.compute_global_alignment(
            sequence, protein_consensus, scoring_matrix, alignment_matrix)
        mark = sum(1 for seq_char, cons_char in zip(aligned[1], aligned[2])
                   if seq_char == cons_char)
        print(mark / float(len(aligned[1])))
def pax_domain(scoring_matrix, local_alignment):
    """
    Compare the local alignments of the human and drosophila eyeless
    proteins with the consensus PAX domain.

    Elements 1 and 2 of local_alignment are taken as the human and
    drosophila aligned sequences.  Each has its dashes removed and is
    aligned with the consensus through seq.compute_alignment_matrix and
    seq.compute_global_alignment.

    Returns a tuple (human, drosophila) with the proportion of aligned
    positions at which the sequence and the consensus hold the same AA.
    """
    # load consensus pax domain
    pax = read_protein(CONSENSUS_PAX_URL)

    # remove dashes from the local alignments
    human = local_alignment[1].replace('-', '')
    drosophila = local_alignment[2].replace('-', '')

    def align_to_pax(sequence):
        matrix = seq.compute_alignment_matrix(sequence, pax, scoring_matrix)
        return seq.compute_global_alignment(sequence, pax, scoring_matrix,
                                            matrix)

    def agreement(aligned):
        # Proportion of positions where both aligned strings match.
        length = len(aligned[1])
        same = sum((1 for pos in range(length)
                    if aligned[1][pos] == aligned[2][pos]), 0.0)
        return same / length

    human_pax = align_to_pax(human)
    drosophila_pax = align_to_pax(drosophila)
    return (agreement(human_pax), agreement(drosophila_pax))
def check_spelling(checked_word, dist, word_list):
    """
    Function for Question 8

    Returns the list of words in word_list whose edit distance to
    checked_word is at most dist, where
    edit distance = |x| + |y| - global alignment score (scores 2, 1, 0).
    If checked_word itself is in word_list, checked_word is returned
    instead.

    Two cheap filters skip words that cannot be within dist before the
    alignment is computed:
      * the lengths must differ by at most dist;
      * at most dist distinct letters of the word may be absent from
        checked_word, since every such letter costs at least one edit.
    The number of alignments actually computed is printed at the end.
    """
    alphabet = set('abcdefghijklmnopqrstuvwxyz')

    if checked_word in word_list:
        # NOTE(review): this returns the bare string, not a list like the
        # general case -- kept for existing callers; confirm it is intended.
        return checked_word

    score_matrix = project4.build_scoring_matrix(alphabet, 2, 1, 0)
    checked_word_chars = set(checked_word)
    checked_len = len(checked_word)

    words = []
    num_checks = 0
    for each_word in word_list:
        if abs(len(each_word) - checked_len) > dist:
            continue
        # Bound the unseen letters by dist; a fixed bound would wrongly
        # drop valid words for larger distances.
        num_diffs = len(set(each_word) - checked_word_chars)
        if num_diffs > dist:
            continue

        align_matrix = project4.compute_alignment_matrix(
            checked_word, each_word, score_matrix, True)
        result = project4.compute_global_alignment(
            checked_word, each_word, score_matrix, align_matrix)
        if checked_len + len(each_word) - result[0] <= dist:
            words.append(each_word)
        num_checks += 1

    print(num_checks)
    return words
def question_1(): human = read_protein(HUMAN_EYELESS_URL) fly = read_protein(FRUITFLY_EYELESS_URL) scoring_matrix = read_scoring_matrix(PAM50_URL) alignment_matrix = project4.compute_alignment_matrix(human, fly, scoring_matrix, False) answer = project4.compute_local_alignment(human, fly, scoring_matrix, alignment_matrix) print "score =", answer[0] print "align human = ", answer[1] print "align fly = ", answer[2] return answer[0]
def question7():
    """
    Question 7

    Globally aligns 'abcde' with 'xycdefg' under a lowercase-letter scoring
    matrix (diagonal 2, off-diagonal 1, dash 0).  Prints both strings, the
    alignment result and the value |x| + |y| - score.
    """
    letters = set('abcdefghijklmnopqrstuvwxyz')
    scoring = project4.build_scoring_matrix(letters, 2, 1, 0)

    first, second = 'abcde', 'xycdefg'
    matrix = project4.compute_alignment_matrix(first, second, scoring, True)
    alignment = project4.compute_global_alignment(first, second, scoring,
                                                  matrix)

    print(first)
    print(second)
    print(alignment)
    print(len(first) + len(second) - alignment[0])
def check_spelling(checked_word, dist, word_list):
    """
    Returns the set of words from word_list that are within edit distance
    dist of checked_word.

    The edit distance of two words is |x| + |y| - score, where score is
    their global alignment score under a scoring matrix built with
    diagonal score 2, off-diagonal score 1 and dash score 0.
    """
    alphabet = set('abcdefghijklmnopqrstuvwxyz')
    # The scoring matrix does not depend on the word, so build it once
    # instead of once per word.
    smtrx = seq.build_scoring_matrix(alphabet, 2, 1, 0)
    checked_len = len(checked_word)

    candidates = set([])
    for word in word_list:
        amtrx = seq.compute_alignment_matrix(checked_word, word, smtrx)
        score = seq.compute_global_alignment(checked_word, word,
                                             smtrx, amtrx)[0]
        if checked_len + len(word) - score <= dist:
            candidates.add(word)
    return candidates
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Build the un-normalized distribution of local alignment scores of
    seq_x against random shuffles of seq_y.

    Each trial shuffles a copy of seq_y, builds the local alignment matrix
    of seq_x and the shuffled sequence, and counts the largest entry of
    that matrix as the trial's score.

    Returns a dictionary {score: number of trials with that score}.  A
    num_trials of zero or less yields an empty dictionary.
    """
    scoring_distribution = {}
    remaining = num_trials
    # "> 0" rather than plain truthiness, so that a negative or fractional
    # count terminates instead of looping forever.
    while remaining > 0:
        shuffledlist = list(seq_y)
        random.shuffle(shuffledlist)
        rand_y = "".join(shuffledlist)
        loc_align_matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        score = max(col for row in loc_align_matrix for col in row)
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
        remaining -= 1
    return scoring_distribution
def calculate_edit_distance(xseq, yseq):
    '''
    Return the edit distance of xseq and yseq
    http://en.wikipedia.org/wiki/Edit_distance

    Computed as |xseq| + |yseq| - score, where score is the global
    alignment score under a scoring matrix over the lowercase letters and
    '-' built with diagonal score 2, off-diagonal score 1 and dash score 0.
    '''
    symbols = set('abcdefghijklmnopqrstuvwxyz-')
    scoring = project4.build_scoring_matrix(symbols, 2, 1, 0)
    matrix = project4.compute_alignment_matrix(xseq, yseq, scoring, True)
    alignment = project4.compute_global_alignment(xseq, yseq, scoring,
                                                  matrix)
    return len(xseq) + len(yseq) - alignment[0]
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Build an un-normalized distribution of local alignment scores.

    The following is repeated num_trials times: shuffle a copy of seq_y
    with shuffle(), compute the local alignment of seq_x with the shuffled
    sequence using scoring_matrix, and add one to the count stored for the
    resulting score.

    Parameters
    ----------
    seq_x: str
        a sequence
    seq_y: str
        another sequence
    scoring_matrix: dict of dicts
        the scoring matrix
    num_trials: int
        the number of trials

    Returns
    -------
    scoring_distribution: dict
        maps each observed score to the number of trials that produced it
    """
    scoring_distribution = defaultdict(int)
    for _trial in range(num_trials):
        # The permutation is passed on as a list of characters.
        permuted = list(seq_y)
        shuffle(permuted)
        matrix = compute_alignment_matrix(seq_x, permuted,
                                          scoring_matrix, False)
        best_score = compute_local_alignment(seq_x, permuted,
                                             scoring_matrix, matrix)[0]
        scoring_distribution[best_score] += 1
    return scoring_distribution
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Returns a dictionary scoring_distribution that represents an
    un-normalized distribution based on num_trials trials.

    Each trial shuffles a copy of seq_y, computes the local alignment of
    seq_x with it and adds one to the count stored under the score.
    """
    scoring_distribution = {}
    for _ in range(num_trials):
        letters = list(seq_y)
        random.shuffle(letters)
        rand_y = ''.join(letters)

        matrix = project4.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        local = project4.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, matrix)
        best = local[0]

        scoring_distribution[best] = scoring_distribution.get(best, 0) + 1
    return scoring_distribution
def question1(): """ Compute the local alignments of the sequences of HumanEyelessProtein and FruitflyEyelessProtein using the PAM50 scoring matrix. """ # Compute local alignments. alignment_matrix = project4.compute_alignment_matrix(HUMAN, FLY, SCORING_MATRIX, global_flag=False) local_alignment = project4.compute_local_alignment(HUMAN, FLY, SCORING_MATRIX, alignment_matrix) align_human = local_alignment[1] align_fly = local_alignment[2] print "Human local alignment:", align_human print "Fruit fly local alignment:", align_fly print "score:", local_alignment[0] return (local_alignment[0], align_human, align_fly)
def check_spelling(checked_word, dist, word_list):
    """
    Function for Question 8

    Returns the list of words in word_list whose edit distance to
    checked_word is at most dist, where
    edit distance = |x| + |y| - global alignment score (scores 2, 1, 0).
    If checked_word itself is in word_list, checked_word is returned
    instead.

    Two cheap filters skip words that cannot be within dist before the
    alignment is computed:
      * the lengths must differ by at most dist;
      * at most dist distinct letters of the word may be absent from
        checked_word, since every such letter costs at least one edit.
    The number of alignments actually computed is printed at the end.
    """
    alphabet = set('abcdefghijklmnopqrstuvwxyz')

    if checked_word in word_list:
        # NOTE(review): this returns the bare string, not a list like the
        # general case -- kept for existing callers; confirm it is intended.
        return checked_word

    score_matrix = project4.build_scoring_matrix(alphabet, 2, 1, 0)
    checked_word_chars = set(checked_word)
    checked_len = len(checked_word)

    words = []
    num_checks = 0
    for each_word in word_list:
        if abs(len(each_word) - checked_len) > dist:
            continue
        # Bound the unseen letters by dist; a fixed bound would wrongly
        # drop valid words for larger distances.
        num_diffs = len(set(each_word) - checked_word_chars)
        if num_diffs > dist:
            continue

        align_matrix = project4.compute_alignment_matrix(
            checked_word, each_word, score_matrix, True)
        result = project4.compute_global_alignment(
            checked_word, each_word, score_matrix, align_matrix)
        if checked_len + len(each_word) - result[0] <= dist:
            words.append(each_word)
        num_checks += 1

    print(num_checks)
    return words
def check_spelling(checked_word, dist, word_list):
    """
    Return the set of words in word_list whose edit distance to
    checked_word is at most dist.

    The distance between two words is len(checked_word) + len(word) minus
    element 0 of their global alignment, using a scoring matrix built over
    the lowercase letters with parameters (2, 1, 0).
    """
    score_table = project4.build_scoring_matrix(
        'abcdefghijklmnopqrstuvwxyz', 2, 1, 0)
    target_len = len(checked_word)

    matches = set()
    for candidate in word_list:
        dp_table = project4.compute_alignment_matrix(
            checked_word, candidate, score_table, True)
        alignment = project4.compute_global_alignment(
            checked_word, candidate, score_table, dp_table)
        # Skip candidates that are too far away; keep the rest.
        if target_len + len(candidate) - alignment[0] > dist:
            continue
        matches.add(candidate)
    return matches
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Score seq_x against num_trials random permutations of seq_y.

    :param seq_x: first sequence, kept fixed in every trial
    :param seq_y: second sequence, randomly permuted in every trial
    :param scoring_matrix: scoring matrix passed through to the aligner
    :param num_trials: number of random trials to run
    :return: a dictionary mapping the trial number (1..num_trials) to the
             local alignment score (element 0 of the result) of that trial
    """
    scores_by_trial = {}
    permutation_size = len(seq_y)
    for trial in xrange(1, num_trials + 1):
        # Sampling every element without replacement yields a permutation.
        permuted_y = ''.join(random.sample(seq_y, permutation_size))
        dp_table = student.compute_alignment_matrix(
            seq_x, permuted_y, scoring_matrix, False)
        alignment = student.compute_local_alignment(
            seq_x, permuted_y, scoring_matrix, dp_table)
        scores_by_trial[trial] = alignment[0]
    return scores_by_trial
def check_spelling(check_word, dist, word_list):
    """
    Collect the words that are close to check_word.

    :param check_word: word to check
    :param dist: maximum edit distance (inclusive)
    :param word_list: list of words (the dictionary)
    :return: list of the words from word_list whose distance to check_word
             is at most dist, in word_list order; the distance is
             len(word) + len(check_word) minus element 0 of the global
             alignment of the two words
    """
    letters = list(string.ascii_lowercase)
    score_matrix = student.build_scoring_matrix(letters, 2, 1, 0)
    check_len = len(check_word)

    close_words = []
    for candidate in word_list:
        dp_table = student.compute_alignment_matrix(
            candidate, check_word, score_matrix, True)
        alignment = student.compute_global_alignment(
            candidate, check_word, score_matrix, dp_table)
        if len(candidate) + check_len - alignment[0] > dist:
            continue
        close_words.append(candidate)
    return close_words
def question7(): """ Question 7 """ alphabet = set([ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' ]) #print len(alphabet) score_matrix = project4.build_scoring_matrix(alphabet, 2, 1, 0) test1 = 'abcde' test2 = 'xycdefg' align_matrix = project4.compute_alignment_matrix(test1, test2, score_matrix, True) result = project4.compute_global_alignment(test1, test2, score_matrix, align_matrix) print test1 print test2 print result print len(test1) + len(test2) - result[0]
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Run num_trials local alignments of seq_x against shuffled copies of
    seq_y and tally the scores.

    Returns a pair (counts, scores):
      counts -- dict mapping each observed score to how often it occurred
      scores -- list of every score in trial order (duplicates kept)
    """
    counts = {}
    scores = []
    for _trial in range(num_trials):
        letters = list(seq_y)
        shuffle(letters)  # in-place permutation of the copy
        permuted_y = ''.join(letters)

        # False selects the local-alignment form of the matrix.
        dp_table = compute_alignment_matrix(
            seq_x, permuted_y, scoring_matrix, False)
        score, _aligned_x, _aligned_y = compute_local_alignment(
            seq_x, permuted_y, scoring_matrix, dp_table)

        counts[score] = counts.get(score, 0) + 1
        scores.append(score)
    return counts, scores
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Tally local alignment scores of seq_x against random shuffles of seq_y.

    Performs num_trials trials.  Returns (histogram, all_scores) where
    histogram maps each score to its number of occurrences and all_scores
    lists the score of every trial in order.
    """
    histogram = {}
    all_scores = []
    for _ in range(num_trials):
        shuffled = list(seq_y)
        random.shuffle(shuffled)
        permuted_y = ''.join(shuffled)

        dp_table = project4.compute_alignment_matrix(
            seq_x, permuted_y, scoring_matrix, False)
        trial_score = project4.compute_local_alignment(
            seq_x, permuted_y, scoring_matrix, dp_table)[0]

        # Start unseen scores at zero, then count this occurrence.
        histogram.setdefault(trial_score, 0)
        histogram[trial_score] += 1
        all_scores.append(trial_score)
    return histogram, all_scores
def check_spelling(checked_word, dist, word_list):
    """
    Find the words of word_list within edit distance dist of checked_word.

    Parameters
    ----------
    checked_word: str
        the word to be checked
    dist: int
        the maximum edit distance (inclusive)
    word_list: list
        a list of candidate words

    Returns
    -------
    result: list
        the words of word_list, in their original order, whose distance
        len(checked_word) + len(word) - global alignment score is at most
        dist.
    """
    score_mat = build_scoring_matrix("abcdefghijklmnopqrstuvwxyz", 2, 1, 0)
    checked_len = len(checked_word)

    def _distance_to(candidate):
        # Distance derived from the global alignment score (element 0).
        dp_table = compute_alignment_matrix(
            checked_word, candidate, score_mat, True)
        alignment = compute_global_alignment(
            checked_word, candidate, score_mat, dp_table)
        return checked_len + len(candidate) - alignment[0]

    return [word for word in word_list if _distance_to(word) <= dist]
def align_human_fly_protein():
    """
    Compute the local alignment of protein_human and protein_fly using
    the module-level scoring_matrix, and return whatever
    project4.compute_local_alignment produces for them.
    """
    # False -> build the alignment matrix in its local (non-global) form.
    dp_table = project4.compute_alignment_matrix(
        protein_human, protein_fly, scoring_matrix, False)
    return project4.compute_local_alignment(
        protein_human, protein_fly, scoring_matrix, dp_table)
Returns: A string representing the protein """ protein_file = urllib2.urlopen(filename) protein_seq = protein_file.read() protein_seq = protein_seq.rstrip() return protein_seq # Q1 #compute_local_alignment(seq_x, seq_y, scoring_matrix, alignment_matrix) seq_fly = read_protein(FRUITFLY_EYELESS_URL) seq_human = read_protein(HUMAN_EYELESS_URL) score_matrix = read_scoring_matrix(PAM50_URL) alignment_matrix = student.compute_alignment_matrix(seq_human, seq_fly, score_matrix, False) result = student.compute_local_alignment(seq_human, seq_fly, score_matrix, alignment_matrix) #print result[0] #human #print result[1] #fly #print result[2] # Q2 seq_pax = read_protein(CONSENSUS_PAX_URL) #fly and pax domain # alignment_matrix_global = student.compute_alignment_matrix(result[2], seq_pax, score_matrix, True) # result2 = student.compute_global_alignment(result[2], seq_pax, score_matrix, alignment_matrix_global)
# read in files as string words = word_file.read() # template lines and solution lines list of line string word_list = words.split('\n') print "Loaded a dictionary with", len(word_list), "words" return word_list # question 1 scoring_matrix = read_scoring_matrix(PAM50_URL) seq_x = read_protein(HUMAN_EYELESS_URL) seq_y = read_protein(FRUITFLY_EYELESS_URL) consensusseq = read_protein(CONSENSUS_PAX_URL) alignment_matrix = student.compute_alignment_matrix(seq_x, seq_y, scoring_matrix, False) score, string_Hu, string_Fr = student.compute_local_alignment( seq_x, seq_y, scoring_matrix, alignment_matrix) print string_Hu newstring_Hu = "" for elem in string_Hu: if elem != '-': newstring_Hu += elem print newstring_Hu newstring_Fr = "" for elem in string_Fr: if elem != '-': newstring_Fr += elem alignment_matrix_Hum_local_Con = student.compute_alignment_matrix(
def edit_distance(seq_x, seq_y):
    """
    Return len(seq_x) + len(seq_y) minus the global alignment score of the
    two sequences, where the alignment uses a scoring matrix built over the
    lowercase ASCII letters with parameters (2, 1, 0).
    """
    score_table = project4.build_scoring_matrix(
        string.ascii_lowercase, 2, 1, 0)
    dp_table = project4.compute_alignment_matrix(
        seq_x, seq_y, score_table, True)
    alignment = project4.compute_global_alignment(
        seq_x, seq_y, score_table, dp_table)
    combined_length = len(seq_x) + len(seq_y)
    # alignment[0] is the score of the global alignment.
    return combined_length - alignment[0]
# read in files as string words = word_file.read() # template lines and solution lines list of line string word_list = words.split('\n') print "Loaded a dictionary with", len(word_list), "words" return word_list HUMAN_EYELESS_PROTEIN = read_protein(HUMAN_EYELESS_URL) FRUITFLY_EYELESS_PROTEIN = read_protein(FRUITFLY_EYELESS_URL) PAM50_SCORING_MATRIX = read_scoring_matrix(PAM50_URL) CONSENSUS_PAX = read_protein(CONSENSUS_PAX_URL) PAM50_ALIGNMENT_MATRIX = student.compute_alignment_matrix( HUMAN_EYELESS_PROTEIN, FRUITFLY_EYELESS_PROTEIN, PAM50_SCORING_MATRIX, True) SEQ_A = 'HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEKQQ' SEQ_B = 'HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ' print CONSENSUS_PAX (SCORE1, CONSENSUS_PAX1, SEQ_A1) = student.compute_global_alignment(CONSENSUS_PAX, SEQ_A, PAM50_SCORING_MATRIX, PAM50_ALIGNMENT_MATRIX) (SCORE2, CONSENSUS_PAX2, SEQ_B2) = student.compute_global_alignment(CONSENSUS_PAX, SEQ_B, PAM50_SCORING_MATRIX, PAM50_ALIGNMENT_MATRIX) print SEQ_A1 print CONSENSUS_PAX1 print SEQ_B2