def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Null distribution generator.

    Runs num_trials trials. Each trial randomly permutes seq_y, computes the
    local alignment of seq_x against the permutation using scoring_matrix and
    increments the tally of the resulting score.

    Returns a dictionary mapping each observed score to the number of trials
    that produced it (an un-normalized distribution).
    """
    scoring_distribution = {}
    for dummy_trial in range(num_trials):
        # shuffle the characters of the y sequence directly; this avoids the
        # separate index list and the quadratic string concatenation
        shuffled = list(seq_y)
        random.shuffle(shuffled)
        rand_y = "".join(shuffled)
        # compute local alignment matrix
        align_matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        # compute local alignment score; the aligned strings are not needed
        score = student.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, align_matrix)[0]
        # dict.get keeps this working on interpreters without has_key()
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
    return scoring_distribution
def question2():
    """
    Compare the human and fruit-fly local alignments with the consensus PAX
    domain.

    Reads the PAM50 scoring matrix and the three proteins, computes the local
    alignment of the human and fruit-fly eyeless proteins, removes the dashes
    from both locally aligned sequences and globally aligns each of them
    against the consensus PAX domain.  For each global alignment it prints
    the fraction of alignment columns in which the two aligned strings agree
    (human first, then fly).
    """
    scoring_matrix = read_scoring_matrix(PAM50_URL)
    human = read_protein(HUMAN_EYELESS_URL)
    fly = read_protein(FRUITFLY_EYELESS_URL)
    consensus_pax = read_protein(CONSENSUS_PAX_URL)

    alignment_matrix = student.compute_alignment_matrix(
        human, fly, scoring_matrix, False)
    local_result = student.compute_local_alignment(
        human, fly, scoring_matrix, alignment_matrix)

    # local_result[1] is the aligned human sequence, local_result[2] the fly
    for local_seq in (local_result[1], local_result[2]):
        stripped = local_seq.replace('-', '')
        global_matrix = student.compute_alignment_matrix(
            stripped, consensus_pax, scoring_matrix, True)
        global_result = student.compute_global_alignment(
            stripped, consensus_pax, scoring_matrix, global_matrix)
        aligned_seq, aligned_pax = global_result[1], global_result[2]
        # Compare every column of the alignment.  The aligned strings can be
        # longer than the consensus itself, so counting only the first
        # len(consensus_pax) columns would ignore the tail of the alignment.
        matches = sum(1 for char_a, char_b in zip(aligned_seq, aligned_pax)
                      if char_a == char_b)
        print(matches * 1.0 / len(aligned_seq))
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    input : two sequences, scoring matrix and number of trials
    output : dictionary mapping the trial number (1 .. num_trials) to the
             local alignment score obtained in that trial
    """
    trial_scores = {}
    seq_list = list(seq_y)
    # Trials are numbered from 1 and the upper bound is inclusive so that
    # exactly num_trials trials are run.
    for trial in range(1, num_trials + 1):
        # shuffle seq y (in place, the same list is reused by every trial)
        random.shuffle(seq_list)
        rand_y = ''.join(seq_list)
        # local alignment of seq_x and rand_y
        alignment_matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        local_alignment = student.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)
        # record the score of this trial
        trial_scores[trial] = local_alignment[0]
    return trial_scores
def run_q1():
    """
    Load the human and fruit-fly eyeless proteins and the PAM50 scoring
    matrix, then return whatever student.compute_local_alignment produces
    for the two proteins.
    """
    human = read_protein(HUMAN_EYELESS_URL)
    fly = read_protein(FRUITFLY_EYELESS_URL)
    pam50 = read_scoring_matrix(PAM50_URL)
    # the matrix is built with the flag False before the local traceback
    local_matrix = student.compute_alignment_matrix(human, fly, pam50, False)
    result = student.compute_local_alignment(human, fly, pam50, local_matrix)
    return result
def check_spelling(checked_word, dist, word_list):
    """
    Return the words of word_list whose distance to checked_word is at most
    dist.

    The distance of a word is computed as
    len(checked_word) + len(word) - score, where score is the global
    alignment score of the two words under the module-level scoring_matrix.

    NOTE(review): this expression equals the edit distance only for a
    scoring matrix with match 2, mismatch 1 and dash 0 -- confirm against
    the place where scoring_matrix is built.
    """
    answer = []
    for word in word_list:
        alignment_matrix = student.compute_alignment_matrix(
            checked_word, word, scoring_matrix, True)
        # The matrix above is built with the flag True (global alignment), so
        # it has to be traced back with the global alignment routine; using
        # the local routine here would report the wrong score.
        result = student.compute_global_alignment(
            checked_word, word, scoring_matrix, alignment_matrix)
        if len(checked_word) + len(word) - result[0] <= dist:
            answer.append(word)
    return answer
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Tally local alignment scores of seq_x against random permutations of
    seq_y.

    A list copy of seq_y is shuffled in place once per trial (so every trial
    permutes the previous permutation) and is passed to the alignment
    routines as a list of characters.  Returns a dictionary mapping each
    score to the number of trials in which it occurred.
    """
    tally = {}
    shuffled_y = list(seq_y[:])
    for _ in range(num_trials):
        random.shuffle(shuffled_y)
        matrix = student.compute_alignment_matrix(
            seq_x, shuffled_y, scoring_matrix, False)
        result = student.compute_local_alignment(
            seq_x, shuffled_y, scoring_matrix, matrix)
        best = result[0]
        tally[best] = tally.get(best, 0) + 1
    return tally
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Count how often each local alignment score occurs over num_trials random
    permutations of seq_y.

    Each permutation is drawn with random.sample over the full length of
    seq_y and joined back into a string before it is aligned with seq_x.
    Returns a dictionary from score to number of occurrences.
    """
    counts = {}
    length = len(seq_y)
    for _ in xrange(num_trials):
        permuted = random.sample(seq_y, length)
        rand_y = ''.join(permuted)
        matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        score = student.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, matrix)[0]
        counts[score] = counts.get(score, 0) + 1
    return counts
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Return a defaultdict(int) that maps each local alignment score to the
    number of trials producing it.

    Every trial shuffles a fresh character list made from seq_y, joins it
    into a string and aligns it locally with seq_x using scoring_matrix.
    """
    from collections import defaultdict

    tally = defaultdict(int)
    for _ in range(num_trials):
        letters = list(seq_y)
        random.shuffle(letters)
        permuted_y = ''.join(letters)
        matrix = student.compute_alignment_matrix(
            seq_x, permuted_y, scoring_matrix, False)
        alignment = student.compute_local_alignment(
            seq_x, permuted_y, scoring_matrix, matrix)
        tally[alignment[0]] += 1
    return tally
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Build a score -> count dictionary from num_trials random trials.

    One list copy of seq_y is kept and reshuffled in place at the start of
    every trial; that list (not a string) is handed to the alignment
    routines together with seq_x and scoring_matrix.
    """
    histogram = {}
    permutation = list(seq_y[:])
    for _ in range(num_trials):
        random.shuffle(permutation)
        local_matrix = student.compute_alignment_matrix(
            seq_x, permutation, scoring_matrix, False)
        local_result = student.compute_local_alignment(
            seq_x, permutation, scoring_matrix, local_matrix)
        top_score = local_result[0]
        histogram.setdefault(top_score, 0)
        histogram[top_score] += 1
    return histogram
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Return a dictionary counting the local alignment scores of seq_x against
    num_trials random permutations of seq_y.

    A single character list made from seq_y is shuffled in place for every
    trial, so each permutation starts from the previous one.
    """
    counts = {}
    letters = list(seq_y)
    for _ in range(num_trials):
        # shuffles the shared list itself; no per-trial copy is taken
        random.shuffle(letters)
        rand_y = ''.join(letters)
        matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        score, _aligned_x, _aligned_y = student.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, matrix)
        if score in counts:
            counts[score] += 1
        else:
            counts[score] = 1
    return counts
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Tally local alignment scores over num_trials shuffles of seq_y.

    The trial index is printed at the start of every trial.  A list copy of
    seq_y is shuffled in place each time and passed, as a list, to the
    alignment routines.  Returns a dictionary from score to count.
    """
    tally = {}
    shuffled = list(seq_y)
    for trial_index in range(num_trials):
        print(trial_index)
        random.shuffle(shuffled)
        matrix = student.compute_alignment_matrix(
            seq_x, shuffled, scoring_matrix, False)
        score, _aligned_x, _aligned_y = student.compute_local_alignment(
            seq_x, shuffled, scoring_matrix, matrix)
        tally[score] = tally.get(score, 0) + 1
    return tally
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trial):
    """
    Return an un-normalized distribution of local alignment scores.

    For each of num_trial trials a fresh copy of seq_y is shuffled, joined
    into a string and locally aligned with seq_x using scoring_matrix.  The
    returned dictionary maps every observed score to its number of
    occurrences.
    """
    scoring_distribution = {}
    for dummy_idx in range(num_trial):
        shuffled = list(seq_y)
        random.shuffle(shuffled)
        rand_y = ''.join(shuffled)
        alignment_matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        score = student.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)[0]
        # dict.get replaces has_key(), which no longer exists in Python 3
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
    return scoring_distribution
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    return a dictionary scoring_distribution that represents an
    un-normalized distribution

    Each of the num_trials trials shuffles a copy of seq_y, aligns it locally
    with seq_x under scoring_matrix and adds one to the count stored for the
    resulting score.
    """
    distribution = {}
    for _ in range(num_trials):
        shuffled = list(seq_y)
        random.shuffle(shuffled)
        rand_y = ''.join(shuffled)
        alignment_matrix = alg_project4_solution.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        alignment = alg_project4_solution.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)
        score = alignment[0]
        distribution[score] = distribution.get(score, 0) + 1
    return distribution
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    return a dictionary scoring_distribution that represents an
    un-normalized distribution

    A trial shuffles a copy of seq_y, computes the local alignment of seq_x
    and the shuffled sequence under scoring_matrix, and increments the entry
    of the resulting score.  num_trials trials are run.
    """
    distribution = {}
    for _ in range(num_trials):
        letters = list(seq_y)
        random.shuffle(letters)
        rand_y = ''.join(letters)
        alignment_matrix = alg_project4_solution.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        alignment = alg_project4_solution.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)
        score = alignment[0]
        distribution[score] = distribution.get(score, 0) + 1
    return distribution
def question1():
    """
    Print the local alignment of the human and fruit-fly eyeless proteins.

    Loads the PAM50 scoring matrix and both proteins, computes their local
    alignment and prints the score followed by the two aligned sequences.
    Returns None.
    """
    pam50 = read_scoring_matrix(PAM50_URL)
    human = read_protein(HUMAN_EYELESS_URL)
    fly = read_protein(FRUITFLY_EYELESS_URL)

    local_matrix = student.compute_alignment_matrix(human, fly, pam50, False)
    result = student.compute_local_alignment(human, fly, pam50, local_matrix)
    score, human_align, fruitfly_align = result

    report = (("Score: ", str(score)),
              ("Human: ", human_align),
              ("FrFly: ", fruitfly_align))
    for label, value in report:
        print(label + value)
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    str, str, dict of dict, int -> dict

    Takes two sequences, a scoring matrix, and a number of trials, and
    returns a dictionary of un-normalized score counts: each key is a local
    alignment score of seq_x against a random permutation of seq_y, each
    value the number of trials that produced it.
    """
    scoring_distribution = {}
    for _ in range(num_trials):
        list_y = list(seq_y)
        random.shuffle(list_y)
        rand_y = ''.join(list_y)
        # align against the seq_x argument, not a module-level sequence
        alignment_matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        alignment = student.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)
        score = alignment[0]
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
    return scoring_distribution
def q3():
    """
    Repeat the consensus-PAX comparison on two random sequences.

    Two random sequences over the 23-letter alphabet are generated, each as
    long as the module-level seq_x.  Their local alignment is computed with
    the module-level scoring_matrix; the score, the dash-free first aligned
    sequence and the second aligned sequence are printed.  Each dash-free
    aligned sequence is then globally aligned with the consensus PAX domain,
    and the lengths of the two aligned strings and the fraction of agreeing
    positions are printed.
    """
    alphabet = "ACBEDGFIHKMLNQPSRTWVYXZ"
    # The random sequences get their own names: assigning to seq_x inside the
    # function would make it a local and len(seq_x) would raise
    # UnboundLocalError before the module-level value could be read.
    len_gen = len(seq_x)
    rand_x = []
    rand_y = []
    for _ in range(len_gen):
        rand_x.append(random.choice(alphabet))
        rand_y.append(random.choice(alphabet))

    alignment_matrix = student.compute_alignment_matrix(
        rand_x, rand_y, scoring_matrix, False)
    score, human_aligen, fruit_aligen = student.compute_local_alignment(
        rand_x, rand_y, scoring_matrix, alignment_matrix)
    print(score)
    human_aligen = human_aligen.replace('-', '')
    print(human_aligen)
    # the second sequence is printed before its dashes are removed
    print(fruit_aligen)
    fruit_aligen = fruit_aligen.replace('-', '')

    pax = read_protein(CONSENSUS_PAX_URL)
    for local_seq in (human_aligen, fruit_aligen):
        # a global traceback needs a matrix built with the flag True
        global_matrix = student.compute_alignment_matrix(
            local_seq, pax, scoring_matrix, True)
        score, aligned_seq, aligned_pax = student.compute_global_alignment(
            local_seq, pax, scoring_matrix, global_matrix)
        print("%d %d" % (len(aligned_seq), len(aligned_pax)))
        same = sum(1 for char_a, char_b in zip(aligned_seq, aligned_pax)
                   if char_a == char_b)
        print(same * 1.0 / len(aligned_seq))
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    input: two sequences, scoring matrix, number of trials.

    Each trial draws a random permutation of seq_y with random.sample (kept
    as a list), computes the local alignment of seq_x and that permutation
    with scoring_matrix, and adds one to the dictionary entry of the
    resulting score.

    output: a dictionary that represents an un-normalized distribution of
    the scores
    """
    tally = {}
    for _ in range(num_trials):
        permutation = random.sample(seq_y, len(seq_y))
        matrix = student.compute_alignment_matrix(
            seq_x, permutation, scoring_matrix, False)
        score = student.compute_local_alignment(
            seq_x, permutation, scoring_matrix, matrix)[0]
        tally[score] = tally.get(score, 0) + 1
    return tally
print "Loaded a dictionary with", len(word_list), "words" return word_list # Q1 HumanEyelessProtein = read_protein(HUMAN_EYELESS_URL) FruitflyEyelessProtein = read_protein(FRUITFLY_EYELESS_URL) PAM50 = read_scoring_matrix(PAM50_URL) alignment_matrix_Q1 = student.compute_alignment_matrix(HumanEyelessProtein, FruitflyEyelessProtein, PAM50, False) result_Q1 = student.compute_local_alignment(HumanEyelessProtein, FruitflyEyelessProtein, PAM50, alignment_matrix_Q1) # Q2 TempHumanSeq = result_Q1[1] FruitflySeq = result_Q1[2] HumanSeq = TempHumanSeq[:len(TempHumanSeq) - 3] + TempHumanSeq[len(TempHumanSeq) - 2:] ConsensusPAXDomain = read_protein(CONSENSUS_PAX_URL) alignment_matrix_Q2_Human = student.compute_alignment_matrix( HumanSeq, ConsensusPAXDomain, PAM50, True) alignment_matrix_Q2_Fruitfly = student.compute_alignment_matrix( FruitflySeq, ConsensusPAXDomain, PAM50, True)
word_file = open(filename) # read in files as string words = word_file.read() # template lines and solution lines list of line string word_list = words.split('\n') print "Loaded a dictionary with", len(word_list), "words" return word_list scoring_matrix = read_scoring_matrix(PAM50_URL) seq_x = read_protein(HUMAN_EYELESS_URL) seq_y = read_protein(FRUITFLY_EYELESS_URL) alignment_matrix = student.compute_alignment_matrix(seq_x, seq_y, scoring_matrix, False) score, human_aligen, fruit_aligen = student.compute_local_alignment(seq_x, seq_y, scoring_matrix, alignment_matrix) print score exit() def q3() : len_gen = len(seq_x) seq_x = [] seq_y = [] for _ in range(len_gen) : seq_x.append(random.choice("ACBEDGFIHKMLNQPSRTWVYXZ")) seq_y.append(random.choice("ACBEDGFIHKMLNQPSRTWVYXZ")) alignment_matrix = student.compute_alignment_matrix(seq_x, seq_y, scoring_matrix, False) score, human_aligen, fruit_aligen = student.compute_local_alignment(seq_x, seq_y, scoring_matrix, alignment_matrix) print score
from matplotlib import pyplot

# URLs of the input data files read by the questions below
PAM50_URL = "http://storage.googleapis.com/codeskulptor-alg/alg_PAM50.txt"
HUMAN_EYELESS_URL = "http://storage.googleapis.com/codeskulptor-alg/alg_HumanEyelessProtein.txt"
FRUITFLY_EYELESS_URL = "http://storage.googleapis.com/codeskulptor-alg/alg_FruitflyEyelessProtein.txt"
CONSENSUS_PAX_URL = "http://storage.googleapis.com/codeskulptor-alg/alg_ConsensusPAXDomain.txt"
WORD_LIST_URL = "http://storage.googleapis.com/codeskulptor-assets/assets_scrabble_words3.txt"

# question 1: local alignment of the human and fruit-fly eyeless proteins
hep = alg_alignment.read_protein(HUMAN_EYELESS_URL)
fep = alg_alignment.read_protein(FRUITFLY_EYELESS_URL)
scoring_matrix = alg_alignment.read_scoring_matrix(PAM50_URL)
# the alignment matrix is built inline with the flag False and handed
# straight to the local alignment routine
hep_fep_local_alignment = alg_project4_solution.compute_local_alignment(hep, fep, scoring_matrix, alg_project4_solution.compute_alignment_matrix( hep, fep, scoring_matrix, False))
# element 0 of the result is kept as the score
human_eyeless_fruitfly_local_alignment_score = hep_fep_local_alignment[0]

# question 1 answer: print the whole alignment result
print "local alignment for human and fruitfly eyeless genome: " + str(hep_fep_local_alignment)

# question 2: global alignment of the dash-free human local alignment with
# the consensus PAX domain
cpd = alg_alignment.read_protein(CONSENSUS_PAX_URL)
# elements 1 and 2 of the local result are the two aligned sequences
hep_local_alignment = hep_fep_local_alignment[1]
fep_local_alignment = hep_fep_local_alignment[2]
hep_local_alignment_no_dashes = hep_local_alignment.replace('-', '')
# here the alignment matrix is built with the flag True before the global
# alignment routine is called
hep_no_dashes_cpd_global_alignment = alg_project4_solution.compute_global_alignment(hep_local_alignment_no_dashes, cpd, scoring_matrix, alg_project4_solution.compute_alignment_matrix( hep_local_alignment_no_dashes, cpd, scoring_matrix, True))
def run_q1():
    """
    Return the result of student.compute_local_alignment for the human and
    fruit-fly eyeless proteins scored with the PAM50 matrix.
    """
    first_protein = read_protein(HUMAN_EYELESS_URL)
    second_protein = read_protein(FRUITFLY_EYELESS_URL)
    scores = read_scoring_matrix(PAM50_URL)
    return student.compute_local_alignment(
        first_protein,
        second_protein,
        scores,
        # matrix built with the flag False, then traced back locally
        student.compute_alignment_matrix(
            first_protein, second_protein, scores, False))
def question1():
    """
    Print the result of the local alignment of the human and fruit-fly
    eyeless proteins under the PAM50 scoring matrix.
    """
    pam50 = read_scoring_matrix(PAM50_URL)
    human_protein = read_protein(HUMAN_EYELESS_URL)
    fly_protein = read_protein(FRUITFLY_EYELESS_URL)
    local_matrix = student.compute_alignment_matrix(
        human_protein, fly_protein, pam50, False)
    local_result = student.compute_local_alignment(
        human_protein, fly_protein, pam50, local_matrix)
    print(local_result)
import math from matplotlib import pyplot PAM50_URL = "http://storage.googleapis.com/codeskulptor-alg/alg_PAM50.txt" HUMAN_EYELESS_URL = "http://storage.googleapis.com/codeskulptor-alg/alg_HumanEyelessProtein.txt" FRUITFLY_EYELESS_URL = "http://storage.googleapis.com/codeskulptor-alg/alg_FruitflyEyelessProtein.txt" CONSENSUS_PAX_URL = "http://storage.googleapis.com/codeskulptor-alg/alg_ConsensusPAXDomain.txt" WORD_LIST_URL = "http://storage.googleapis.com/codeskulptor-assets/assets_scrabble_words3.txt" # question 1 hep = alg_alignment.read_protein(HUMAN_EYELESS_URL) fep = alg_alignment.read_protein(FRUITFLY_EYELESS_URL) scoring_matrix = alg_alignment.read_scoring_matrix(PAM50_URL) hep_fep_local_alignment = alg_project4_solution.compute_local_alignment( hep, fep, scoring_matrix, alg_project4_solution.compute_alignment_matrix(hep, fep, scoring_matrix, False)) human_eyeless_fruitfly_local_alignment_score = hep_fep_local_alignment[0] # question 1 answer print "local alignment for human and fruitfly eyeless genome: " + str( hep_fep_local_alignment) # question 2 cpd = alg_alignment.read_protein(CONSENSUS_PAX_URL) hep_local_alignment = hep_fep_local_alignment[1] fep_local_alignment = hep_fep_local_alignment[2] hep_local_alignment_no_dashes = hep_local_alignment.replace('-', '') hep_no_dashes_cpd_global_alignment = alg_project4_solution.compute_global_alignment(
# template lines and solution lines list of line string word_list = words.split('\n') print "Loaded a dictionary with", len(word_list), "words" return word_list # Question 1 ################################################################## HUMAN_EYELESS = read_protein(HUMAN_EYELESS_URL) FRUITFLY_EYELESS = read_protein(FRUITFLY_EYELESS_URL) SCORING_MATRIX = read_scoring_matrix(PAM50_URL) ALIGNMENT_MATRIX = student.compute_alignment_matrix(HUMAN_EYELESS, \ FRUITFLY_EYELESS,\ SCORING_MATRIX, False) student.compute_local_alignment(HUMAN_EYELESS, FRUITFLY_EYELESS,\ SCORING_MATRIX, ALIGNMENT_MATRIX) # Question 2 ################################################################## PAX = read_protein(CONSENSUS_PAX_URL) loc_score, loc_human, loc_fly = student.compute_local_alignment(HUMAN_EYELESS,\ FRUITFLY_EYELESS,\ SCORING_MATRIX,\ ALIGNMENT_MATRIX) for align in (loc_human, loc_fly): align = align.replace('-', '') alignment_matrix = student.compute_alignment_matrix( align, PAX, SCORING_MATRIX, True) score, alignment, cons = student.compute_global_alignment( align, PAX, SCORING_MATRIX, alignment_matrix)
def find_local_align(): score_matrix = read_scoring_matrix(PAM50_URL) seq_human = read_protein(HUMAN_EYELESS_URL) seq_fly = read_protein(FRUITFLY_EYELESS_URL) local_alignment_matrix = student.compute_alignment_matrix( seq_human, seq_fly, score_matrix, False) score, seq_loc_human, seq_loc_fly = student.compute_local_alignment( seq_human, seq_fly, score_matrix, local_alignment_matrix) length = len(seq_loc_fly) agree = 0 for idx in range(length): if seq_loc_fly[idx] == seq_loc_human[idx]: agree += 1 print 'Question 1:\n' print 'score:', score, '\nhuman:', seq_loc_human, '\nfly: ', seq_loc_fly print 'Agree percentage: %.2f' % (100 * float(agree) / length) """ Question 1: local alignment score: 875 human: HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEK-QQ fly: HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ Agree percentage: 93.98% """ ### Question 2 ### print '\nQuestion 2:\n' seq_loc_human = seq_loc_human.replace('-', '') seq_loc_fly = seq_loc_fly.replace('-', '') seq_pax = read_protein(CONSENSUS_PAX_URL) #Q2 # seq_pax = 'ACBEDGFIHKMLNQPSRTWVYXZ' #Q3 for idx in range(2): if idx == 0: seq = seq_loc_human type = 'human' else: seq = seq_loc_fly type = 'fly' global_alignment_matrix = student.compute_alignment_matrix( seq, seq_pax, score_matrix, True) score, x_glbl, pax_glbl = student.compute_global_alignment( seq, seq_pax, score_matrix, global_alignment_matrix) length = len(x_glbl) agree = 0 for idx in range(length): if x_glbl[idx] == pax_glbl[idx]: agree += 1 print 'score:', score, '\n' + type, x_glbl, '\nPAX: ', pax_glbl print type + ' agree percentage: %.2f' % (100 * float(agree) / length) """ Question 2: human score: 613 human: -HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEKQQ PAX: 
GHGGVNQLGGVFVNGRPLPDVVRQRIVELAHQGVRPCDISRQLRVSHGCVSKILGRYYETGSIKPGVIGGSKPKVATPKVVEKIAEYKRQNPTMFAWEIRDRLLAERVCDNDTVPSVSSINRIIR-------- human agree percentage: 72.93 flyscore: 586 fly: -HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ PAX: GHGGVNQLGGVFVNGRPLPDVVRQRIVELAHQGVRPCDISRQLRVSHGCVSKILGRYYETGSIKPGVIGGSKPKVATPKVVEKIAEYKRQNPTMFAWEIRDRLLAERVCDNDTVPSVSSINRIIR--------- fly agree percentage: 70.15 """ """
def build_scoring_matrix(alphabet, diag_score=10, off_diag_score=4,
                         dash_score=-6):
    """
    Make a scoring matrix with diag_score, off_diag_score, dash_score.

    alphabet       -- iterable of characters (iterated more than once)
    diag_score     -- score of two identical characters (default 10)
    off_diag_score -- score of two different characters (default 4)
    dash_score     -- score of a character against '-' (default -6)

    Returns a dictionary of dictionaries indexed by the characters of
    alphabet plus '-'; the entry for '-' against '-' is 0.  With the default
    arguments the result is the same as the previously hard-coded matrix.
    """
    matrix = {'-': {'-': 0}}
    for row in alphabet:
        matrix[row] = {'-': dash_score}
        matrix['-'][row] = dash_score
        for col in alphabet:
            matrix[row][col] = diag_score if row == col else off_diag_score
    return matrix


# quick manual check of the alignment routines on a tiny example
m = build_scoring_matrix(chset)
s = student.compute_alignment_matrix("AA", "TAAT", m, False)
print(s[0][2])
print(s[2][0])
print(s[2][2])
print(s)
ali = student.compute_local_alignment("AA", "TAAT", m, s)
print(ali)