def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Simulate local-alignment scores of seq_x against shuffled copies of seq_y.

    Inputs:
        seq_x, seq_y: character strings that share a common alphabet with
            scoring_matrix.
        scoring_matrix: output of build_scoring_matrix. Dictionary of
            dictionaries whose [seq_x[i]][seq_y[j]] value is the score of
            aligning seq_x[i] with seq_y[j].
        num_trials: integer number of simulations to run.

    Output:
        A list with one local-alignment score per trial, in trial order.
    """
    null_scores = []
    for _ in range(num_trials):
        # Shuffle a fresh copy each trial; seq_y itself is left untouched.
        letters = list(seq_y)
        random.shuffle(letters)
        permuted_y = ''.join(letters)

        # Build the local (global flag False) matrix, then keep element 0
        # of the local-alignment result as the score for this trial.
        local_matrix = seq.compute_alignment_matrix(
            seq_x, permuted_y, scoring_matrix, False)
        result = seq.compute_local_alignment(
            seq_x, permuted_y, scoring_matrix, local_matrix)
        null_scores.append(result[0])
    return null_scores
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Function for question 4.

    Build an un-normalized distribution of local-alignment scores between
    seq_x and random permutations of seq_y.

    seq_x, seq_y: sequences to align (seq_y is shuffled, seq_x is not).
    scoring_matrix: scoring matrix handed to the project4 alignment helpers.
    num_trials: number of shuffle-and-align rounds to run.

    Returns a dictionary mapping each observed score to the number of
    trials that produced it. The trial index is printed after every round.
    """
    shuffled_y = list(seq_y)
    scoring_distribution = {}
    for dummy_idx in range(num_trials):
        # The working list is re-shuffled in place on every round.
        random.shuffle(shuffled_y)
        new_seq_y = ''.join(shuffled_y)

        # Use the scoring_matrix argument; the previous code read a global
        # named `scores` and silently ignored the parameter.
        align_matrix = project4.compute_alignment_matrix(
            seq_x, new_seq_y, scoring_matrix, False)
        local_result = project4.compute_local_alignment(
            seq_x, new_seq_y, scoring_matrix, align_matrix)

        score = local_result[0]
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
        # Single-argument call form prints identically on Python 2 and 3.
        print(dummy_idx)
    return scoring_distribution
def generate_null_distribution2(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Null distribution of local-alignment scores, cached in 'distr.json'.

    If 'distr.json' can be read from the working directory, its stored
    scores are returned without running any trial. Otherwise num_trials
    shuffles of seq_y are aligned locally against seq_x and the result is
    written to 'distr.json' before returning.

    Returns a pair (distr, raw):
        distr: {score1: count1, score2: count2, ..., scoren: countn}
        raw:   [score1, score2, ..., scoren], duplicates included

    NOTE(review): the cache is not keyed on the arguments, so a file left
    over from a run with different sequences or num_trials is returned
    as-is. Delete 'distr.json' to force a recomputation.
    (Original author's note: this function works; the distr.json approach
    was taken over from balta2ar.)
    """
    try:
        with open('distr.json') as f:
            pair = loads(f.read())
        raw = pair['raw']
        # JSON object keys are always strings, so the stored 'distr' comes
        # back keyed by '57' instead of 57. Recount from the raw score list
        # so the cached result has the same key types as a fresh run.
        distr = {}
        for score in raw:
            distr[score] = distr.get(score, 0) + 1
        return distr, raw
    except (IOError, OSError, ValueError, KeyError, TypeError) as e:
        # Missing, unreadable or malformed cache: fall through and recompute.
        print('can\'t open file', str(e))

    distr = {}
    raw = []
    for _ in range(num_trials):
        temp = list(seq_y)
        shuffle(temp)
        rand_y = ''.join(temp)
        align_matrix = compute_alignment_matrix(seq_x, rand_y, scoring_matrix, False)
        score, _, _ = compute_local_alignment(seq_x, rand_y, scoring_matrix, align_matrix)
        distr[score] = distr.get(score, 0) + 1
        raw.append(score)

    with open('distr.json', 'w') as f:
        f.write(dumps({'distr': distr, 'raw': raw}))
    return distr, raw
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Function for question 4.

    Build an un-normalized distribution of local-alignment scores between
    seq_x and random permutations of seq_y.

    seq_x, seq_y: sequences to align (seq_y is shuffled, seq_x is not).
    scoring_matrix: scoring matrix handed to the project4 alignment helpers.
    num_trials: number of shuffle-and-align rounds to run.

    Returns a dictionary mapping each observed score to the number of
    trials that produced it. The trial index is printed after every round.
    """
    shuffled_y = list(seq_y)
    scoring_distribution = {}
    for dummy_idx in range(num_trials):
        # The working list is re-shuffled in place on every round.
        random.shuffle(shuffled_y)
        new_seq_y = ''.join(shuffled_y)

        # Use the scoring_matrix argument; the previous code read a global
        # named `scores` and silently ignored the parameter.
        align_matrix = project4.compute_alignment_matrix(
            seq_x, new_seq_y, scoring_matrix, False)
        local_result = project4.compute_local_alignment(
            seq_x, new_seq_y, scoring_matrix, align_matrix)

        score = local_result[0]
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
        # Single-argument call form prints identically on Python 2 and 3.
        print(dummy_idx)
    return scoring_distribution
def question_1():
    '''
    Question 1: local alignment of the human and fruit fly eyeless proteins.

    Loads HumanEyelessProtein and FruitflyEyelessProtein (the amino acid
    sequences of the eyeless protein in each genome) and the PAM50 scoring
    matrix through the provided helper module. PAM50 is defined over the
    alphabet {A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V,B,Z,X,-}, i.e. all
    amino acids plus the gap character.

    Returns whatever project4.compute_local_alignment returns for the
    pair (human, fruit fly): the score and the two local alignments,
    including any dashes ('-') they contain.
    '''
    human_seq = provided.read_protein(provided.HUMAN_EYELESS_URL)
    fly_seq = provided.read_protein(provided.FRUITFLY_EYELESS_URL)
    pam50 = provided.read_scoring_matrix(provided.PAM50_URL)

    # False selects the local (not global) alignment matrix.
    local_matrix = project4.compute_alignment_matrix(
        human_seq, fly_seq, pam50, False)
    return project4.compute_local_alignment(
        human_seq, fly_seq, pam50, local_matrix)
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    '''
    Helper function for Question 4.

    Takes two sequences seq_x and seq_y, a scoring matrix scoring_matrix
    and a number of trials num_trials, and returns a dictionary
    scoring_distribution representing an un-normalized distribution built
    by repeating the following num_trials times:

      1. Generate a random permutation rand_y of seq_y with
         random.shuffle().
      2. Compute the maximum local-alignment score of seq_x and rand_y
         under scoring_matrix.
      3. Increment the entry for that score in scoring_distribution.

    The number of completed trials is printed after every round.
    '''
    scoring_distribution = {}
    for trial in range(1, num_trials + 1):
        seq_y_list = list(seq_y)
        random.shuffle(seq_y_list)
        rand_y = ''.join(seq_y_list)

        alignment_matrix = project4.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        score = project4.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)[0]

        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
        # Call form works on Python 3 and prints the same text on Python 2.
        print(trial)
    return scoring_distribution
def question1():
    """
    Code for question 1.

    Reads the human and fruit fly eyeless proteins and the PAM50 scoring
    matrix, then returns the result of compute_local_alignment for the
    pair (human, fruit fly).
    """
    human_seq = read_protein(HUMAN_EYELESS_URL)
    fly_seq = read_protein(FRUITFLY_EYELESS_URL)
    pam50 = read_scoring_matrix(PAM50_URL)

    # False requests the local-alignment variant of the matrix.
    local_matrix = compute_alignment_matrix(human_seq, fly_seq, pam50, False)
    return compute_local_alignment(human_seq, fly_seq, pam50, local_matrix)
def question1():
    """
    QUESTION 1: local alignment of the eyeless proteins.

    Works on the module-level names fruitfly_protein, human_protein and
    scores. The fruit fly sequence is passed first and the human sequence
    second.

    NOTE(review): the aligned strings are bound to locals but nothing is
    returned or printed, so the function yields None - confirm whether a
    return statement was intended.
    """
    local_matrix = project4.compute_alignment_matrix(
        fruitfly_protein, human_protein, scores, False)
    eyeless_result = project4.compute_local_alignment(
        fruitfly_protein, human_protein, scores, local_matrix)

    # Element 1 is taken as the fruit fly alignment and element 2 as the
    # human one, matching the argument order used above.
    local_fruitfly = eyeless_result[1]
    local_human = eyeless_result[2]
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Build an un-normalized null distribution of local-alignment scores.

    For each of num_trials rounds, seq_y is randomly permuted, aligned
    locally against seq_x under scoring_matrix, and the resulting score's
    count is incremented. A progress bar is advanced once per round.

    The finished dictionary {score: count} is passed to save_dict() and
    then returned.
    """
    distribution = {}
    # Size the bar from num_trials; the previous hard-coded 1000 did not
    # match any other trial count.
    bar = progressbar.ProgressBar(max_value=num_trials)
    for progress in range(num_trials):
        bar.update(progress)

        letters = list(seq_y)
        random.shuffle(letters)
        # Join back into a string, as the other variants of this helper do,
        # instead of handing a list of characters to the alignment code.
        rand_y = ''.join(letters)

        alignment_matrix = project4.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        score = project4.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)[0]
        distribution[score] = distribution.get(score, 0) + 1

    save_dict(distribution)
    return distribution
def question1():
    """
    QUESTION 1: local alignment of the eyeless proteins.

    Works on the module-level names fruitfly_protein, human_protein and
    scores. The fruit fly sequence is passed first and the human sequence
    second.

    NOTE(review): the aligned strings are bound to locals but nothing is
    returned or printed, so the function yields None - confirm whether a
    return statement was intended.
    """
    local_matrix = project4.compute_alignment_matrix(
        fruitfly_protein, human_protein, scores, False)
    eyeless_result = project4.compute_local_alignment(
        fruitfly_protein, human_protein, scores, local_matrix)

    # Element 1 is taken as the fruit fly alignment and element 2 as the
    # human one, matching the argument order used above.
    local_fruitfly = eyeless_result[1]
    local_human = eyeless_result[2]
def align_eyeless(scoring_matrix):
    """
    Locally align the human and drosophila eyeless amino acid sequences.

    scoring_matrix: the scoring matrix to align with (PAM 50 in the
        assignment).

    Returns the result of seq.compute_local_alignment for the pair
    (human, drosophila).
    """
    # Fetch both eyeless amino acid strings.
    human_aa = read_protein(HUMAN_EYELESS_URL)
    fly_aa = read_protein(FRUITFLY_EYELESS_URL)

    # False selects the local-alignment matrix.
    local_matrix = seq.compute_alignment_matrix(
        human_aa, fly_aa, scoring_matrix, False)

    return seq.compute_local_alignment(
        human_aa, fly_aa, scoring_matrix, local_matrix)
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    '''
    Run one shuffle-and-align trial and return its score.

    1) Generate a random permutation of the sequence seq_y.
    2) Compute the local alignment of seq_x and that permutation using
       the score matrix 'scoring_matrix'.

    Returns element 0 of the local-alignment result (the score).

    NOTE(review): num_trials is accepted but never read in this function;
    presumably the caller loops over it - confirm.
    '''
    letters = list(seq_y)
    random.shuffle(letters)
    rand_y = ''.join(letters)

    local_matrix = project4.compute_alignment_matrix(
        seq_x, rand_y, scoring_matrix, False)
    return project4.compute_local_alignment(
        seq_x, rand_y, scoring_matrix, local_matrix)[0]
def question_1():
    """
    Question 1: local alignment of the human and fruit fly eyeless proteins.

    Reads both proteins and the PAM50 scoring matrix, computes their local
    alignment with project4, prints the score and the two aligned strings,
    and returns the score (element 0 of the alignment result).
    """
    human = read_protein(HUMAN_EYELESS_URL)
    fly = read_protein(FRUITFLY_EYELESS_URL)
    scoring_matrix = read_scoring_matrix(PAM50_URL)

    alignment_matrix = project4.compute_alignment_matrix(
        human, fly, scoring_matrix, False)
    answer = project4.compute_local_alignment(
        human, fly, scoring_matrix, alignment_matrix)

    # The Python 2 print statement joined its items with one space; the
    # "%s %s" form reproduces that text and also runs on Python 3.
    print("%s %s" % ("score =", answer[0]))
    print("%s %s" % ("align human = ", answer[1]))
    print("%s %s" % ("align fly = ", answer[2]))
    return answer[0]
def _percent_identical(aligned_a, aligned_b):
    """Return the percentage of positions at which two aligned strings agree."""
    if not aligned_a:
        # An empty alignment has nothing to compare; avoid dividing by zero.
        return 0.0
    matches = sum(1 for left, right in zip(aligned_a, aligned_b) if left == right)
    return (float(matches) / len(aligned_a)) * 100


def question_2():
    """
    Question 2: agreement of each local alignment with the consensus PAX domain.

    Steps:
      1. Compute the local alignment of the human and fruit fly eyeless
         proteins under PAM50.
      2. Strip the dashes from each locally aligned string.
      3. Globally align each dash-less string with the consensus sequence.
      4. Print, for each species, the percentage of positions at which the
         two globally aligned strings agree.

    Prints the two percentages; returns nothing.
    """
    human = read_protein(HUMAN_EYELESS_URL)
    fly = read_protein(FRUITFLY_EYELESS_URL)
    consensus = read_protein(CONSENSUS_PAX_URL)
    scoring_matrix = read_scoring_matrix(PAM50_URL)

    alignment_matrix_local = project4.compute_alignment_matrix(
        human, fly, scoring_matrix, False)
    local_aligns = project4.compute_local_alignment(
        human, fly, scoring_matrix, alignment_matrix_local)
    human_no_dashes = local_aligns[1].replace('-', '')
    fly_no_dashes = local_aligns[2].replace('-', '')

    global_matrix_human_consensus = project4.compute_alignment_matrix(
        human_no_dashes, consensus, scoring_matrix, True)
    global_matrix_fly_consensus = project4.compute_alignment_matrix(
        fly_no_dashes, consensus, scoring_matrix, True)

    global_align_human_consensus = project4.compute_global_alignment(
        human_no_dashes, consensus, scoring_matrix,
        global_matrix_human_consensus)
    global_align_fly_consensus = project4.compute_global_alignment(
        fly_no_dashes, consensus, scoring_matrix,
        global_matrix_fly_consensus)

    # Compare against the ALIGNED consensus (element 2), not the raw one:
    # the global alignment may insert dashes into the consensus, and
    # zipping with the raw string would shift every later position.
    human_percentage = _percent_identical(
        global_align_human_consensus[1], global_align_human_consensus[2])
    fly_percentage = _percent_identical(
        global_align_fly_consensus[1], global_align_fly_consensus[2])

    # "%s %s" mirrors the space the Python 2 print statement inserted
    # between items and also runs on Python 3.
    print("%s %s" % ("human percentage: ", human_percentage))
    print("%s %s" % ("fly percentage: ", fly_percentage))
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Calculate an un-normalized distribution of local-alignment scores.

    The following process is performed num_trials times:
    generate a random permutation rand_y of seq_y using shuffle(), compute
    the maximum local-alignment score of seq_x and rand_y under
    scoring_matrix, and increment that score's entry by one.

    Parameters
    ----------
    seq_x: str
        a sequence
    seq_y: str
        another sequence
    scoring_matrix: dict of dicts
        the scoring matrix
    num_trials: int
        the number of trials

    Returns
    -------
    scoring_distribution: defaultdict(int)
        maps each observed score to the number of trials that produced it
    """
    scoring_distribution = defaultdict(int)
    for _ in range(num_trials):
        letters = list(seq_y)
        shuffle(letters)
        # Hand the alignment helpers a str, as documented for seq_y,
        # rather than the shuffled list of characters.
        rand_y = ''.join(letters)

        align_mat = compute_alignment_matrix(seq_x, rand_y, scoring_matrix, False)
        alignment = compute_local_alignment(seq_x, rand_y, scoring_matrix, align_mat)
        scoring_distribution[alignment[0]] += 1
    return scoring_distribution
def question1():
    """
    Compute the local alignments of the sequences of HumanEyelessProtein
    and FruitflyEyelessProtein using the PAM50 scoring matrix.

    Reads the module-level HUMAN, FLY and SCORING_MATRIX values, prints
    both aligned strings and the score, and returns the tuple
    (score, align_human, align_fly).
    """
    # Compute local alignments.
    alignment_matrix = project4.compute_alignment_matrix(
        HUMAN, FLY, SCORING_MATRIX, global_flag=False)
    local_alignment = project4.compute_local_alignment(
        HUMAN, FLY, SCORING_MATRIX, alignment_matrix)
    align_human = local_alignment[1]
    align_fly = local_alignment[2]

    # "%s %s" reproduces the single space the Python 2 print statement put
    # between items, and the call form also runs on Python 3.
    print("%s %s" % ("Human local alignment:", align_human))
    print("%s %s" % ("Fruit fly local alignment:", align_fly))
    print("%s %s" % ("score:", local_alignment[0]))
    return (local_alignment[0], align_human, align_fly)
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Returns a dictionary scoring_distribution that represents an
    un-normalized distribution based on the given number of trials
    num_trials.

    Each trial shuffles a copy of seq_y, aligns it locally against seq_x
    under scoring_matrix, and adds one to the count stored for the
    resulting score.
    """
    scoring_distribution = {}
    for dummy in range(num_trials):
        y_list = list(seq_y)
        random.shuffle(y_list)
        rand_y = ''.join(y_list)

        alignment_matrix = project4.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        score = project4.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)[0]

        # dict.get avoids the `in d.keys()` test, which builds and scans a
        # list on Python 2.
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
    return scoring_distribution
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Record the local-alignment score of seq_x against random permutations
    of seq_y.

    :param seq_x: seq_x
    :param seq_y: seq_y (a fresh random permutation is drawn per trial)
    :param scoring_matrix: scoring matrix
    :param num_trials: number of trials
    :return: a dictionary mapping the trial number (1..num_trials) to the
        score obtained in that trial. Note this is keyed by trial, not by
        score, so repeated scores appear once per trial.
    """
    scoring_distr = {}
    # range instead of the Python-2-only xrange so this also runs on Python 3.
    for i in range(1, num_trials + 1):
        # Sampling every position without replacement is a random permutation.
        rand_y = ''.join(random.sample(seq_y, len(seq_y)))
        alignment_matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        result = student.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)
        scoring_distr[i] = result[0]
    return scoring_distr
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Score seq_x against num_trials random permutations of seq_y.

    Returns a pair (counts, scores):
        counts: {score1: count1, score2: count2, ..., scoren: countn}
        scores: [score1, score2, ..., scoren] in trial order, duplicates kept
    """
    counts = {}
    scores = []
    for _ in range(num_trials):
        letters = list(seq_y)
        shuffle(letters)
        permuted_y = ''.join(letters)

        # False requests the local alignment matrix.
        local_matrix = compute_alignment_matrix(
            seq_x, permuted_y, scoring_matrix, False)
        score, _, _ = compute_local_alignment(
            seq_x, permuted_y, scoring_matrix, local_matrix)

        counts[score] = counts.get(score, 0) + 1
        scores.append(score)
    return counts, scores
def question1And2():
    """
    Print the answers to Question 1 and Question 2.

    Question 1: local alignment (score and both aligned strings) of the
    human and fruit fly eyeless proteins under PAM50.
    Question 2: how well each locally aligned string, with dashes removed,
    agrees with the consensus PAX domain.
    """
    human = read_protein(HUMAN_EYELESS_URL)
    fly = read_protein(FRUITFLY_EYELESS_URL)
    print(len(human), len(fly))

    pam50 = read_scoring_matrix(PAM50_URL)
    local_matrix = compute_alignment_matrix(human, fly, pam50, False)
    best_score, human_local, fly_local = compute_local_alignment(
        human, fly, pam50, local_matrix)

    print('Question 1')
    print('The score of the local alignment is: ', best_score)
    print('The sequence for the HumanEyelessProtein is: ', human_local)
    print('The sequence for the FruitflyEyelessProtein is: ', fly_local)
    print()

    print('Question2')
    consensus = read_protein(CONSENSUS_PAX_URL)

    # Step 1: delete any dashes '-' present in each local alignment.
    human_nodash = human_local.replace('-', '')
    fly_nodash = fly_local.replace('-', '')

    # Step 2: build the global alignment matrix of each dash-less sequence
    # against the ConsensusPAXDomain sequence.
    human_cons_matrix = compute_alignment_matrix(
        human_nodash, consensus, pam50, True)
    fly_cons_matrix = compute_alignment_matrix(
        fly_nodash, consensus, pam50, True)

    # Step 3: agreement() is handed each global matrix and yields the value
    # printed below as the agreement percentage.
    human_agree = agreement(human_nodash, consensus, pam50, human_cons_matrix)
    fly_agree = agreement(fly_nodash, consensus, pam50, fly_cons_matrix)
    print('Human vs Consensus agree = %s%%' % human_agree)
    print('Fly vs Consensus agree = %s%%' % fly_agree)
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Score seq_x against num_trials random permutations of seq_y.

    Returns a pair (counts, scores): a dictionary mapping each observed
    local-alignment score to how many trials produced it, and the list of
    all scores in trial order.
    """
    counts = {}
    scores = []
    for _ in range(num_trials):
        letters = list(seq_y)
        random.shuffle(letters)
        permuted_y = ''.join(letters)

        # False selects the local-alignment matrix.
        local_matrix = project4.compute_alignment_matrix(
            seq_x, permuted_y, scoring_matrix, False)
        result = project4.compute_local_alignment(
            seq_x, permuted_y, scoring_matrix, local_matrix)

        trial_score = result[0]
        counts[trial_score] = counts.get(trial_score, 0) + 1
        scores.append(trial_score)
    return counts, scores
def compare(n, nh, nf, alpha, cons, scoring, align):
    '''
    Average agreement with the consensus for randomly generated sequences.

    n: number of trials
    nh: number of characters chosen from alpha and assigned to x
    nf: number of characters chosen from alpha and assigned to y
    alpha: original string set: alpha = 'ACBEDGFIHKMLNQPSRTWVYXZ'
    cons: consensus string
    scoring: scoring matrix for alpha
    align: retained only so existing callers keep working; it is no longer
        read. A matrix built for one pair of sequences cannot be reused
        for the freshly generated random pairs below, so every matrix is
        now computed from the sequences it is applied to.

    Prints the mean agreement of the random "human" and "fly" sequences
    with the consensus; returns nothing.
    '''
    ag1, ag2 = [], []
    for _ in range(n):
        x, y = rprot(nh, alpha), rprot(nf, alpha)

        # Local alignment matrix for this particular random pair.
        local_matrix = compute_alignment_matrix(x, y, scoring, False)
        _, xs, ys = compute_local_alignment(x, y, scoring, local_matrix)
        xs_nodash = xs.replace('-', '')
        ys_nodash = ys.replace('-', '')

        # agreement() is given a global matrix built from the same two
        # sequences it compares, mirroring how question1And2 calls it.
        ag1.append(agreement(
            xs_nodash, cons, scoring,
            compute_alignment_matrix(xs_nodash, cons, scoring, True)))
        ag2.append(agreement(
            ys_nodash, cons, scoring,
            compute_alignment_matrix(ys_nodash, cons, scoring, True)))

    hc_agree = sum(ag1) / float(n)
    fc_agree = sum(ag2) / float(n)
    print('Random Human vs Consensus agree = %s%%' % hc_agree)
    print('Random Fly vs Consensus agree = %s%%' % fc_agree)
A string representing the protein """ protein_file = urllib2.urlopen(filename) protein_seq = protein_file.read() protein_seq = protein_seq.rstrip() return protein_seq # Q1 #compute_local_alignment(seq_x, seq_y, scoring_matrix, alignment_matrix) seq_fly = read_protein(FRUITFLY_EYELESS_URL) seq_human = read_protein(HUMAN_EYELESS_URL) score_matrix = read_scoring_matrix(PAM50_URL) alignment_matrix = student.compute_alignment_matrix(seq_human, seq_fly, score_matrix, False) result = student.compute_local_alignment(seq_human, seq_fly, score_matrix, alignment_matrix) #print result[0] #human #print result[1] #fly #print result[2] # Q2 seq_pax = read_protein(CONSENSUS_PAX_URL) #fly and pax domain # alignment_matrix_global = student.compute_alignment_matrix(result[2], seq_pax, score_matrix, True) # result2 = student.compute_global_alignment(result[2], seq_pax, score_matrix, alignment_matrix_global) #print result2[0] #print result2[1]
def align_human_fly_protein():
    """
    Locally align the module-level protein_human and protein_fly sequences.

    Uses the module-level scoring_matrix and returns the result of
    project4.compute_local_alignment unchanged.
    """
    # False selects the local-alignment matrix.
    local_matrix = project4.compute_alignment_matrix(
        protein_human, protein_fly, scoring_matrix, False)
    return project4.compute_local_alignment(
        protein_human, protein_fly, scoring_matrix, local_matrix)
# template lines and solution lines list of line string word_list = words.split('\n') print "Loaded a dictionary with", len(word_list), "words" return word_list # question 1 scoring_matrix = read_scoring_matrix(PAM50_URL) seq_x = read_protein(HUMAN_EYELESS_URL) seq_y = read_protein(FRUITFLY_EYELESS_URL) consensusseq = read_protein(CONSENSUS_PAX_URL) alignment_matrix = student.compute_alignment_matrix(seq_x, seq_y, scoring_matrix, False) score, string_Hu, string_Fr = student.compute_local_alignment( seq_x, seq_y, scoring_matrix, alignment_matrix) print string_Hu newstring_Hu = "" for elem in string_Hu: if elem != '-': newstring_Hu += elem print newstring_Hu newstring_Fr = "" for elem in string_Fr: if elem != '-': newstring_Fr += elem alignment_matrix_Hum_local_Con = student.compute_alignment_matrix( newstring_Hu, consensusseq, scoring_matrix, True) score1, str_Hu_Con, str_Con_Hu = student.compute_global_alignment(
def read_words(filename):
    """
    Load word list from the URL named filename.

    Prints how many entries were loaded and returns them as a list of
    strings (the response body split on newlines).
    """
    # load assets
    word_file = urllib2.urlopen(filename)
    try:
        # read in the whole response as one string
        words = word_file.read()
    finally:
        # Release the connection even if the read fails.
        word_file.close()

    word_list = words.split('\n')
    # "%s %s %s" reproduces the spacing of the Python 2 print statement and
    # is also valid Python 3 syntax.
    print("%s %s %s" % ("Loaded a dictionary with", len(word_list), "words"))
    return word_list


# Question 1: local alignment of the human and fruit fly eyeless proteins
# under PAM50, computed and printed at import time.
HUMAN_EYELESS_PROTEIN = read_protein(HUMAN_EYELESS_URL)
FRUITFLY_EYELESS_PROTEIN = read_protein(FRUITFLY_EYELESS_URL)
PAM50_SCORING_MATRIX = read_scoring_matrix(PAM50_URL)
PAM50_ALIGNMENT_MATRIX = student.compute_alignment_matrix(
    HUMAN_EYELESS_PROTEIN, FRUITFLY_EYELESS_PROTEIN,
    PAM50_SCORING_MATRIX, False)
print(student.compute_local_alignment(
    HUMAN_EYELESS_PROTEIN, FRUITFLY_EYELESS_PROTEIN,
    PAM50_SCORING_MATRIX, PAM50_ALIGNMENT_MATRIX))
""" protein_file = urllib2.urlopen(filename) protein_seq = protein_file.read() protein_seq = protein_seq.rstrip() return protein_seq # Q1 #compute_local_alignment(seq_x, seq_y, scoring_matrix, alignment_matrix) seq_fly = read_protein(FRUITFLY_EYELESS_URL) seq_human = read_protein(HUMAN_EYELESS_URL) score_matrix = read_scoring_matrix(PAM50_URL) alignment_matrix = student.compute_alignment_matrix(seq_human, seq_fly, score_matrix, False) result = student.compute_local_alignment(seq_human, seq_fly, score_matrix, alignment_matrix) #print result[0] #human #print result[1] #fly #print result[2] # Q2 seq_pax = read_protein(CONSENSUS_PAX_URL) #fly and pax domain # alignment_matrix_global = student.compute_alignment_matrix(result[2], seq_pax, score_matrix, True) # result2 = student.compute_global_alignment(result[2], seq_pax, score_matrix, alignment_matrix_global) #print result2[0] #print result2[1] #print result2[2]