def question2():
    """
    Print how closely the human and fruit fly local alignments match the
    consensus PAX domain.

    Steps:
      1. Load the scoring matrix (PAM50_URL) and the human, fruit fly and
         consensus PAX sequences.
      2. Compute the local alignment of the human and fruit fly sequences.
      3. Remove the dashes from each locally aligned string and globally
         align it with the consensus PAX sequence.
      4. Print, for human and then for fruit fly, the fraction of columns
         of the global alignment in which the two aligned strings agree.

    Takes no arguments and returns None; the two fractions are printed.
    """
    scoring_matrix = read_scoring_matrix(PAM50_URL)
    human = read_protein(HUMAN_EYELESS_URL)
    fly = read_protein(FRUITFLY_EYELESS_URL)

    # for question 3
    # acids = 'ACBEDGFIHKMLNQPSRTWVYXZ'
    # hlen = len(human)
    # flen = len(fly)
    # human_random, fly_random = '', ''
    # for dummy_i in xrange(hlen):
    #     human_random = human_random + human[random.randint(1,23)]
    # for dummy_i in xrange(flen):
    #     fly_random = fly_random + fly[random.randint(1,23)]
    # human = human_random
    # fly = fly_random

    consensus_pax = read_protein(CONSENSUS_PAX_URL)

    # Local alignment of the two eyeless sequences (global_flag=False).
    alignment_matrix = student.compute_alignment_matrix(
        human, fly, scoring_matrix, False)
    local_result = student.compute_local_alignment(
        human, fly, scoring_matrix, alignment_matrix)

    # Elements 1 and 2 of the result are used as the aligned strings;
    # dashes are removed before re-aligning against the consensus.
    local_human = local_result[1].replace('-', '')
    local_fly = local_result[2].replace('-', '')

    for local_seq in (local_human, local_fly):
        global_matrix = student.compute_alignment_matrix(
            local_seq, consensus_pax, scoring_matrix, True)
        result = student.compute_global_alignment(
            local_seq, consensus_pax, scoring_matrix, global_matrix)
        aligned_seq, aligned_pax = result[1], result[2]

        # Compare every column of the global alignment and divide by the
        # alignment length.  Looping over len(consensus_pax) only would
        # skip the columns introduced by dashes in the aligned consensus
        # string and overstate the agreement.
        matches = sum(1 for left, right in zip(aligned_seq, aligned_pax)
                      if left == right)
        print(matches * 1.0 / len(aligned_seq))
def check_spelling(checked_word, dist, word_list):
    """
    Return, in word_list order, the words whose distance to checked_word
    is at most dist.

    The distance of a word is len(checked_word) + len(word) minus the
    global alignment score under a scoring matrix built with diagonal
    score 2, off-diagonal score 1 and dash score 0 over the lowercase
    letters a-z.
    """
    letters = set('abcdefghijklmnopqrstuvwxyz')
    letter_scores = student.build_scoring_matrix(letters, 2, 1, 0)

    def distance_to(candidate):
        # Global alignment (global_flag=True); element 0 is the score.
        matrix = student.compute_alignment_matrix(
            checked_word, candidate, letter_scores, True)
        alignment = student.compute_global_alignment(
            checked_word, candidate, letter_scores, matrix)
        return len(checked_word) + len(candidate) - alignment[0]

    return [candidate for candidate in word_list
            if distance_to(candidate) <= dist]
def question2():
    """
    Globally align the hard-coded human and fruit fly local sequences with
    the consensus PAX domain and print, for each, the percentage of
    agreeing columns followed by the two aligned strings.
    """
    pam50 = read_scoring_matrix(PAM50_URL)
    human_seq = "HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEKQQ"
    frfly_seq = "HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ"
    pax = read_protein(CONSENSUS_PAX_URL)

    def percent_agreement(aligned, reference):
        # Percentage of positions of `aligned` that equal `reference`.
        hits = sum(1 for pos in range(len(aligned))
                   if aligned[pos] == reference[pos])
        return float(hits) / len(aligned) * 100

    # Global alignment matrices first, then the alignments themselves.
    human_matrix = student.compute_alignment_matrix(
        human_seq, pax, pam50, True)
    fly_matrix = student.compute_alignment_matrix(
        frfly_seq, pax, pam50, True)
    _, human_aligned, pax_vs_human = student.compute_global_alignment(
        human_seq, pax, pam50, human_matrix)
    _, fly_aligned, pax_vs_fly = student.compute_global_alignment(
        frfly_seq, pax, pam50, fly_matrix)

    human_percent = percent_agreement(human_aligned, pax_vs_human)
    fly_percent = percent_agreement(fly_aligned, pax_vs_fly)

    print("% Human: " + str(human_percent))
    print("Hmn: " + human_aligned)
    print("PAX: " + pax_vs_human)
    print("")
    print("% FrFly: " + str(fly_percent))
    print("Fly: " + fly_aligned)
    print("PAX: " + pax_vs_fly)
def q3():
    """
    Repeat the consensus PAX comparison with two random amino-acid
    sequences.

    Two random sequences over "ACBEDGFIHKMLNQPSRTWVYXZ" are generated,
    locally aligned with each other, and each locally aligned string (with
    dashes removed) is then globally aligned with the consensus PAX
    sequence.  Printed, in order: the local score, the first local
    alignment without dashes, the second local alignment, and for each
    global alignment the two aligned lengths followed by the fraction of
    agreeing columns.

    Takes no arguments and returns None.
    """
    acids = "ACBEDGFIHKMLNQPSRTWVYXZ"

    # NOTE(review): seq_x and scoring_matrix are read from module scope --
    # confirm they are defined there.  The random sequences use their own
    # local names: assigning to a local called seq_x would make the
    # len(seq_x) below raise UnboundLocalError.
    len_gen = len(seq_x)

    chars_x, chars_y = [], []
    for _ in range(len_gen):
        # Alternate the draws so the random stream is consumed in the
        # same order for both sequences.
        chars_x.append(random.choice(acids))
        chars_y.append(random.choice(acids))
    # Use strings rather than lists, like every other sequence passed to
    # the alignment functions.
    random_x = ''.join(chars_x)
    random_y = ''.join(chars_y)

    alignment_matrix = student.compute_alignment_matrix(
        random_x, random_y, scoring_matrix, False)
    score, local_x, local_y = student.compute_local_alignment(
        random_x, random_y, scoring_matrix, alignment_matrix)
    print(score)
    print(local_x.replace('-', ''))
    print(local_y)

    pax = read_protein(CONSENSUS_PAX_URL)
    for stripped in (local_x.replace('-', ''), local_y.replace('-', '')):
        # The matrix handed to compute_global_alignment must be built
        # with global_flag=True.
        global_matrix = student.compute_alignment_matrix(
            stripped, pax, scoring_matrix, True)
        _, aligned, aligned_pax = student.compute_global_alignment(
            stripped, pax, scoring_matrix, global_matrix)
        print("%d %d" % (len(aligned), len(aligned_pax)))
        same = sum(1 for left, right in zip(aligned, aligned_pax)
                   if left == right)
        print(same * 1.0 / len(aligned))
def run_q2(origin_seq_x):
    """
    Return the fraction of columns that agree when origin_seq_x (dashes
    removed) is globally aligned with the hard-coded reference sequence.

    Also prints the lengths of the two input sequences and of the two
    aligned strings.
    """
    reference = 'GHGGVNQLGGVFVNGRPLPDVVRQRIVELAHQGVRPCDISRQLRVSHGCVSKILGRYYETGSIKPGVIGGSKPKVATPKVVEKIAEYKRQNPTMFAWEIRDRLLAERVCDNDTVPSVSSINRIIR'
    stripped = origin_seq_x.replace('-', '')
    pam50 = read_scoring_matrix(PAM50_URL)

    global_matrix = student.compute_alignment_matrix(
        stripped, reference, pam50, True)
    _, aligned_x, aligned_ref = student.compute_global_alignment(
        stripped, reference, pam50, global_matrix)

    # Both aligned strings must have the same number of columns.
    assert len(aligned_x) == len(aligned_ref)
    print (len(stripped), len(reference), len(aligned_x), len(aligned_ref))

    agreeing = sum(1 for left, right in zip(aligned_x, aligned_ref)
                   if left == right)
    return agreeing * 1.0 / len(aligned_ref)
def run_q2(origin_seq_x):
    """
    Globally align origin_seq_x (with its dashes stripped) against the
    hard-coded reference sequence and return matches / alignment length.

    The four lengths (both inputs, both aligned strings) are printed
    before the result is computed.
    """
    query = origin_seq_x.replace('-', '')
    target = 'GHGGVNQLGGVFVNGRPLPDVVRQRIVELAHQGVRPCDISRQLRVSHGCVSKILGRYYETGSIKPGVIGGSKPKVATPKVVEKIAEYKRQNPTMFAWEIRDRLLAERVCDNDTVPSVSSINRIIR'
    pam50_scores = read_scoring_matrix(PAM50_URL)
    dp_table = student.compute_alignment_matrix(
        query, target, pam50_scores, True)
    alignment = student.compute_global_alignment(
        query, target, pam50_scores, dp_table)
    _, query_aligned, target_aligned = alignment

    assert len(query_aligned) == len(target_aligned)
    print(len(query), len(target), len(query_aligned), len(target_aligned))

    columns = len(target_aligned)
    hits = 0
    for pos in range(columns):
        # Count the columns where both aligned strings hold the same symbol.
        hits += 1 if query_aligned[pos] == target_aligned[pos] else 0
    return hits * 1.0 / columns
def question7(seq_x, seq_y):
    """
    Print the edit distance between seq_x and seq_y and their global
    alignment.

    The scoring matrix is built over the lowercase letters with diagonal
    score 2, off-diagonal score 1 and dash score 0; the edit distance is
    len(seq_x) + len(seq_y) minus the global alignment score.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    # Arguments after the alphabet: diagonal, off-diagonal, dash scores.
    letter_scores = student.build_scoring_matrix(letters, 2, 1, 0)
    dp_table = student.compute_alignment_matrix(
        seq_x, seq_y, letter_scores, True)
    best_score, aligned_x, aligned_y = student.compute_global_alignment(
        seq_x, seq_y, letter_scores, dp_table)

    distance = len(seq_x) + len(seq_y) - best_score
    print("Edit distance: " + str(distance))
    print(aligned_x)
    print(aligned_y)
def check_spelling(checked_word, dist, word_list):
    """
    Return the set of words in word_list whose distance to checked_word
    is less than or equal to dist.

    input: word, target distance, and word list
    output: set of the words for which
            len(checked_word) + len(word) - global alignment score <= dist

    NOTE(review): scoring_matrix is read from module scope -- confirm it
    is the matrix intended for edit-distance scoring.
    """
    checked_len = len(checked_word)
    result = set()
    for item in word_list:
        alignment_matrix = student.compute_alignment_matrix(
            checked_word, item, scoring_matrix, True)
        # Element 0 of the global alignment result is used as the score.
        alignment_score = student.compute_global_alignment(
            checked_word, item, scoring_matrix, alignment_matrix)[0]
        if checked_len + len(item) - alignment_score <= dist:
            # add() updates in place; union() would copy the whole set on
            # every hit.
            result.add(item)
    return result
def check_spelling(checked_word, dist, word_list):
    """
    Collect, in word_list order, every word whose distance to checked_word
    does not exceed dist.

    Distance is len(checked_word) + len(word) minus the global alignment
    score, using a scoring matrix over a-z with diagonal score 2,
    off-diagonal score 1 and dash score 0.
    """
    base_len = len(checked_word)
    letters = set('abcdefghijklmnopqrstuvwxyz')
    letter_scores = student.build_scoring_matrix(letters, 2, 1, 0)

    close_words = []
    for candidate in word_list:
        dp_table = student.compute_alignment_matrix(
            checked_word, candidate, letter_scores, True)
        best_score = student.compute_global_alignment(
            checked_word, candidate, letter_scores, dp_table)[0]
        if base_len + len(candidate) - best_score > dist:
            # Too far from checked_word; skip it.
            continue
        close_words.append(candidate)
    return close_words
def check_spelling(checked_word, dist, word_list):
    """
    Return the list of words from word_list that lie within edit distance
    dist of checked_word (order of word_list preserved).
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    # Diagonal 2, off-diagonal 1, dash 0: with these scores the edit
    # distance is len(x) + len(y) - global alignment score.
    letter_scores = student.build_scoring_matrix(letters, 2, 1, 0)

    def is_close(candidate):
        dp_table = student.compute_alignment_matrix(
            checked_word, candidate, letter_scores, True)
        best_score, _, _ = student.compute_global_alignment(
            checked_word, candidate, letter_scores, dp_table)
        return len(checked_word) + len(candidate) - best_score <= dist

    return [candidate for candidate in word_list if is_close(candidate)]
def check_spelling(checked_word, dist, word_list):
    """
    input: the word to check, a maximum distance, and an iterable of words
    output: the set of all words in word_list that are within edit
            distance dist of checked_word
    """
    letters = set('abcdefghijklmnopqrstuvwxyz')
    # Diagonal score 2, off-diagonal score 1, dash score 0.
    letter_scores = student.build_scoring_matrix(letters, 2, 1, 0)

    def distance_to(candidate):
        dp_table = student.compute_alignment_matrix(
            checked_word, candidate, letter_scores, True)
        alignment = student.compute_global_alignment(
            checked_word, candidate, letter_scores, dp_table)
        # Element 0 of the alignment result is used as the score.
        return len(checked_word) + len(candidate) - alignment[0]

    return set(candidate for candidate in word_list
               if distance_to(candidate) <= dist)
hep_fep_local_alignment = alg_project4_solution.compute_local_alignment(hep, fep, scoring_matrix, alg_project4_solution.compute_alignment_matrix( hep, fep, scoring_matrix, False)) human_eyeless_fruitfly_local_alignment_score = hep_fep_local_alignment[0] # question 1 answer print "local alignment for human and fruitfly eyeless genome: " + str(hep_fep_local_alignment) # question 2 cpd = alg_alignment.read_protein(CONSENSUS_PAX_URL) hep_local_alignment = hep_fep_local_alignment[1] fep_local_alignment = hep_fep_local_alignment[2] hep_local_alignment_no_dashes = hep_local_alignment.replace('-', '') hep_no_dashes_cpd_global_alignment = alg_project4_solution.compute_global_alignment(hep_local_alignment_no_dashes, cpd, scoring_matrix, alg_project4_solution.compute_alignment_matrix( hep_local_alignment_no_dashes, cpd, scoring_matrix, True)) fep_local_alignment_no_dashes = fep_local_alignment.replace('-', '') fep_no_dashes_cpd_global_alignment = alg_project4_solution.compute_global_alignment(fep_local_alignment_no_dashes, cpd, scoring_matrix, alg_project4_solution.compute_alignment_matrix(fep_local_alignment_no_dashes, cpd, scoring_matrix, True)) print hep_no_dashes_cpd_global_alignment print fep_no_dashes_cpd_global_alignment # compute the percentage of elements in these two sequences that agree hndga = hep_no_dashes_cpd_global_alignment[1] hndgacpd = hep_no_dashes_cpd_global_alignment[2] human_consensus_match_count = 0 for idx, elem in enumerate(hndga):
False)) human_eyeless_fruitfly_local_alignment_score = hep_fep_local_alignment[0] # question 1 answer print "local alignment for human and fruitfly eyeless genome: " + str( hep_fep_local_alignment) # question 2 cpd = alg_alignment.read_protein(CONSENSUS_PAX_URL) hep_local_alignment = hep_fep_local_alignment[1] fep_local_alignment = hep_fep_local_alignment[2] hep_local_alignment_no_dashes = hep_local_alignment.replace('-', '') hep_no_dashes_cpd_global_alignment = alg_project4_solution.compute_global_alignment( hep_local_alignment_no_dashes, cpd, scoring_matrix, alg_project4_solution.compute_alignment_matrix( hep_local_alignment_no_dashes, cpd, scoring_matrix, True)) fep_local_alignment_no_dashes = fep_local_alignment.replace('-', '') fep_no_dashes_cpd_global_alignment = alg_project4_solution.compute_global_alignment( fep_local_alignment_no_dashes, cpd, scoring_matrix, alg_project4_solution.compute_alignment_matrix( fep_local_alignment_no_dashes, cpd, scoring_matrix, True)) print hep_no_dashes_cpd_global_alignment print fep_no_dashes_cpd_global_alignment # compute the percentage of elements in these two sequences that agree hndga = hep_no_dashes_cpd_global_alignment[1] hndgacpd = hep_no_dashes_cpd_global_alignment[2]
student.compute_local_alignment(HUMAN_EYELESS, FRUITFLY_EYELESS,\ SCORING_MATRIX, ALIGNMENT_MATRIX) # Question 2 ################################################################## PAX = read_protein(CONSENSUS_PAX_URL) loc_score, loc_human, loc_fly = student.compute_local_alignment(HUMAN_EYELESS,\ FRUITFLY_EYELESS,\ SCORING_MATRIX,\ ALIGNMENT_MATRIX) for align in (loc_human, loc_fly): align = align.replace('-', '') alignment_matrix = student.compute_alignment_matrix( align, PAX, SCORING_MATRIX, True) score, alignment, cons = student.compute_global_alignment( align, PAX, SCORING_MATRIX, alignment_matrix) print sum([alignment[i] == cons[i] for i in range(len(alignment))]) / float(len(alignment)) # Question 4 ################################################################## def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials): scoring_distribution = {} list_y = list(seq_y) for trial in range(num_trials): temp_y = list_y random.shuffle(temp_y) rand_y = ''.join(temp_y) alignment_matrix = student.compute_alignment_matrix( seq_x, rand_y, scoring_matrix, False)
def score(x, y):
    """
    Return element 0 (the score) of the global alignment of x and y,
    computed with the module-level scoring_matrix.
    """
    dp_table = student.compute_alignment_matrix(x, y, scoring_matrix, True)
    alignment = student.compute_global_alignment(
        x, y, scoring_matrix, dp_table)
    return alignment[0]
def find_local_align(): score_matrix = read_scoring_matrix(PAM50_URL) seq_human = read_protein(HUMAN_EYELESS_URL) seq_fly = read_protein(FRUITFLY_EYELESS_URL) local_alignment_matrix = student.compute_alignment_matrix( seq_human, seq_fly, score_matrix, False) score, seq_loc_human, seq_loc_fly = student.compute_local_alignment( seq_human, seq_fly, score_matrix, local_alignment_matrix) length = len(seq_loc_fly) agree = 0 for idx in range(length): if seq_loc_fly[idx] == seq_loc_human[idx]: agree += 1 print 'Question 1:\n' print 'score:', score, '\nhuman:', seq_loc_human, '\nfly: ', seq_loc_fly print 'Agree percentage: %.2f' % (100 * float(agree) / length) """ Question 1: local alignment score: 875 human: HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEK-QQ fly: HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ Agree percentage: 93.98% """ ### Question 2 ### print '\nQuestion 2:\n' seq_loc_human = seq_loc_human.replace('-', '') seq_loc_fly = seq_loc_fly.replace('-', '') seq_pax = read_protein(CONSENSUS_PAX_URL) #Q2 # seq_pax = 'ACBEDGFIHKMLNQPSRTWVYXZ' #Q3 for idx in range(2): if idx == 0: seq = seq_loc_human type = 'human' else: seq = seq_loc_fly type = 'fly' global_alignment_matrix = student.compute_alignment_matrix( seq, seq_pax, score_matrix, True) score, x_glbl, pax_glbl = student.compute_global_alignment( seq, seq_pax, score_matrix, global_alignment_matrix) length = len(x_glbl) agree = 0 for idx in range(length): if x_glbl[idx] == pax_glbl[idx]: agree += 1 print 'score:', score, '\n' + type, x_glbl, '\nPAX: ', pax_glbl print type + ' agree percentage: %.2f' % (100 * float(agree) / length) """ Question 2: human score: 613 human: -HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEKQQ PAX: 
GHGGVNQLGGVFVNGRPLPDVVRQRIVELAHQGVRPCDISRQLRVSHGCVSKILGRYYETGSIKPGVIGGSKPKVATPKVVEKIAEYKRQNPTMFAWEIRDRLLAERVCDNDTVPSVSSINRIIR-------- human agree percentage: 72.93 flyscore: 586 fly: -HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ PAX: GHGGVNQLGGVFVNGRPLPDVVRQRIVELAHQGVRPCDISRQLRVSHGCVSKILGRYYETGSIKPGVIGGSKPKVATPKVVEKIAEYKRQNPTMFAWEIRDRLLAERVCDNDTVPSVSSINRIIR--------- fly agree percentage: 70.15 """ """
# Q2
# Elements 1 and 2 of the question 1 result are used as the locally aligned
# human and fruit fly strings.
TempHumanSeq = result_Q1[1]
# Remove every alignment dash from both sequences before the global
# alignment.  Deleting one character at a fixed offset from the end only
# works for one particular alignment, and the fruit fly string was not
# de-dashed at all.
HumanSeq = TempHumanSeq.replace('-', '')
FruitflySeq = result_Q1[2].replace('-', '')
ConsensusPAXDomain = read_protein(CONSENSUS_PAX_URL)

alignment_matrix_Q2_Human = student.compute_alignment_matrix(
    HumanSeq, ConsensusPAXDomain, PAM50, True)
alignment_matrix_Q2_Fruitfly = student.compute_alignment_matrix(
    FruitflySeq, ConsensusPAXDomain, PAM50, True)
result_Q2_Human = student.compute_global_alignment(
    HumanSeq, ConsensusPAXDomain, PAM50, alignment_matrix_Q2_Human)
result_Q2_Fruitfly = student.compute_global_alignment(
    FruitflySeq, ConsensusPAXDomain, PAM50, alignment_matrix_Q2_Fruitfly)


def calculate_score(seq1, seq2):
    """
    Return the percentage (0-100, float) of positions at which seq1 and
    seq2 hold the same element.

    If the two sequences differ in length, "Wrong!" is printed and None is
    returned.  Two empty sequences raise ZeroDivisionError.
    """
    if len(seq1) != len(seq2):
        print("Wrong!")
        return None
    num_equal = sum(1 for left, right in zip(seq1, seq2) if left == right)
    return float(num_equal) / float(len(seq1)) * 100
def score(x, y):
    """
    Globally align x with y using the module-level scoring_matrix and
    return the first element of the result, the alignment score.
    """
    return student.compute_global_alignment(
        x, y, scoring_matrix,
        student.compute_alignment_matrix(x, y, scoring_matrix, True))[0]