def do(sequences, query_seq, position, pattern, changed_pattern, SW=0):
    '''Return the change in sum-of-pairs score caused by a mutation.

    The column at ``position`` is scored twice with ``calculate_sop``: once
    with ``query_seq`` added to the aligned sequences and once with the
    mutated query built by ``util.strsub(query_seq, position,
    changed_pattern)``.

    Args:
        sequences: list of aligned sequences. It is NOT modified; the query
            is appended to a private copy (earlier versions appended to the
            caller's list and left the mutant query in it).
        query_seq: aligned query sequence.
        position: position handed to ``check_mutation_position.do`` and
            ``util.strsub``.
        pattern: original residue. Accepted for interface compatibility but
            not used by this function.
        changed_pattern: mutant residue substituted into the query.
        SW: truthy to weight sequences with ``landgraf_sequence_weights.do``;
            falsy (default) gives every sequence a weight of 1.

    Returns:
        Score obtained with the mutant query minus the score obtained with
        the original query.
    '''

    def _score_with(query):
        # Work on a copy so the caller's list is never touched.
        seqs = list(sequences)
        seqs.append(query)
        patterns = check_mutation_position.do(seqs, position)
        if SW:
            sw = landgraf_sequence_weights.do(seqs)
        else:
            sw = list(np.ones(len(seqs)))
        return calculate_sop(patterns, sw)

    # case 1: original amino acid
    original_score = _score_with(query_seq)

    # case 2: mutant amino acid
    new_query_seq = util.strsub(query_seq, position, changed_pattern)
    mutant_score = _score_with(new_query_seq)

    return mutant_score - original_score
def main():
    '''Ad-hoc driver: score one hard-coded mutation and print the result.'''
    filename = '../alignments/Q92481'
    position = 72
    changed_pattern = 'R'

    import load_alignments
    import uniquify_alignments
    import check_mutation_position
    import correct_alignment_position

    # Sort ascending by match percentage, then flip in place so the best
    # matches come first.
    ranked = sorted(load_alignments.do(filename),
                    key=itemgetter('match_percentage'))
    ranked.reverse()

    proteins = uniquify_alignments.do(ranked)
    query, _others = util.fetch_query_protein_in_alignments(proteins)
    query_alignment = query['alignment']

    # Map the hard-coded position onto the aligned query sequence.
    mod_position = correct_alignment_position.do(query_alignment, position)
    pattern = query_alignment[mod_position]

    aligned = [prot['alignment'] for prot in proteins]
    patterns = check_mutation_position.do(aligned, mod_position)

    # NOTE(review): do() is called with three arguments here; confirm this
    # matches the signature of the do() defined in this module.
    print(do(patterns, pattern, changed_pattern))
def do(sequences):
    '''Compute one weight per sequence, averaged over all alignment columns.

    For each column (obtained via ``check_mutation_position.do``) a sequence
    whose residue is '-' or 'X' gets weight 0; any other residue gets
    ``1 / (len(util.unique(column)) * freq[residue])`` where ``freq`` comes
    from ``util.calc_frequency(column)``. 'B' is treated as 'N' and 'Z' as
    'Q' when looking up the frequency.

    Args:
        sequences: aligned sequences; the length of the first one determines
            how many columns are visited.

    Returns:
        numpy array with one entry per sequence: the per-column weights
        summed over all columns and multiplied by ``1.0 / number_of_columns``.
    '''
    n_columns = len(sequences[0])  # length of aligned sequences
    n_sequences = len(sequences)  # number of sequences
    ambiguous = {'B': 'N', 'Z': 'Q'}

    # Running sum of the per-column weight vectors.
    avg_weight = np.zeros(n_sequences)
    for col in range(n_columns):
        column = check_mutation_position.do(sequences, col)
        freq = util.calc_frequency(column)
        n_distinct = len(util.unique(column))

        col_weights = []
        for row in range(n_sequences):
            residue = column[row]
            if residue in ('-', 'X'):
                col_weights.append(0)
                continue
            if residue in ambiguous:
                # NOTE(review): freq and the distinct count were computed
                # before this remap; confirm util.calc_frequency provides an
                # entry for the remapped residue.
                residue = column[row] = ambiguous[residue]
            col_weights.append(1.0 / (n_distinct * freq[residue]))

        avg_weight += np.array(col_weights)

    # compute average of the weights over all positions
    avg_weight *= 1.0 / n_columns
    return avg_weight
if len(alignments) == 0: continue # fetch actual position of mutation in aligned query sequence try: actual_pos = correct_alignment_position.do(query['alignment'], position) except Exception as e: print(query['alignment']) continue # fetch the corresponding sequences sequences = [a['alignment'] for a in alignments] # compute frequency of amino acid at desired position aa = check_mutation_position.do([k['alignment'] for k in alignments], actual_pos) # compute mean value in the column mean = {} cnt = 0 for key in values.keys(): # for each physco property first = 1 for z in aa: if first: if z != '-': mean[key] = values[key][z] cnt += 1 first = 0 else: continue else:
def work(line):
    '''Compute all scores for a single mutation record.

    Args:
        line: comma-separated mutation info
            <PROTEIN_ID, POSITION, ORIGINAL_AMINO_ACID, MUTANT_AMINO_ACID>.
            POSITION is decremented by one before being used as an index.

    Returns:
        Flat list of strings: the fields of ``line`` followed by, for every
        directory in ``align_dirs``, the number of aligned proteins plus one
        and the six scores computed below. An empty list is returned as soon
        as a directory has no alignment file for the protein.
    '''
    parts = line.strip().split(',')
    protein = parts[0]
    position = int(parts[1]) - 1  # correction for indexing python lists
    mut_aa = parts[3]

    result = []
    print(protein)
    for alignment_dir in align_dirs:
        alignment_file = os.path.join(alignment_dir, protein)
        if not os.path.isfile(alignment_file):
            return []

        fasta = util.read_sequence('../fasta/%s.fasta' % protein)

        # load, then sort & prune the list of proteins
        proteins = prune_proteins_list(load_alignments.do(alignment_file))

        # fetch the record pertaining to the query protein; the second
        # return value (proteins without the query) is not needed here
        query, _ = util.fetch_query_protein_in_alignments(proteins)
        query_alignment = query['alignment']
        alignments = [prot["alignment"] for prot in proteins]

        result.append(len(proteins) + 1)

        # the original residue is taken from the FASTA sequence, not from
        # the ORIGINAL_AMINO_ACID field of the record
        orig_aa = fasta[position]

        # correct the position of query protein wrt to alignment
        mod_pos = correct_alignment_position.do(query_alignment, position)
        aa = check_mutation_position.do(alignments, mod_pos)

        # Each scorer receives its own copy of the list it is given.

        # shannon entropy score w/o sequence weights
        result.append(
            shannon_entropy_score.do(list(alignments), query_alignment,
                                     mod_pos, orig_aa, mut_aa))
        # shannon entropy score with sequence weights
        result.append(
            shannon_entropy_score.do(list(alignments), query_alignment,
                                     mod_pos, orig_aa, mut_aa, 1))
        # von-neumann entropy score
        result.append(von_neumann_entropy_score.do(list(aa), orig_aa, mut_aa))
        # sum-of-pairs scores, without and with sequence weights
        result.append(
            sum_of_pairs_score.do(list(alignments), query_alignment, mod_pos,
                                  orig_aa, mut_aa, 0))
        result.append(
            sum_of_pairs_score.do(list(alignments), query_alignment, mod_pos,
                                  orig_aa, mut_aa, 1))

    # return mutation information and scores for recording in a file
    return [str(item) for item in chain(parts, result)]
def do(file1, file2):
    '''Score every mutation listed in ``file1`` and write the rows to ``file2``.

    Args:
        file1: path of a comma-separated file; the first five fields of each
            line are kept (protein, position, original residue, mutant
            residue, and one further field that is passed through).
        file2: path of the output file. It is opened with a ``with`` block so
            it is closed even if a scorer raises.

    Each output row is the five input fields followed by, in order:
    simple frequency (original, mutant), pseudo-count (original, mutant),
    sequence-weighted frequency (original, mutant), gapped frequency
    (original, mutant), Shannon entropy, weighted Shannon entropy,
    von Neumann entropy, sum-of-pairs, weighted sum-of-pairs.

    Mutations are skipped when the alignment file is missing, when no
    sequences remain besides the query, or when the mutation position cannot
    be mapped onto the aligned query.
    '''
    # fetch all mutations
    mutations = []
    with open(file1) as fp:
        for line in fp:
            parts = line.strip().split(',')
            mutations.append(
                [parts[0], parts[1], parts[2], parts[3], parts[4]])

    with open(file2, 'w') as ofp:
        for mut in mutations:
            # fetch reqd info
            protein = mut[0]
            position = int(mut[1]) - 1
            orig_aa = mut[2]
            mut_aa = mut[3]

            # if protein not aligned - pass this
            alignment_file = '../alignments/%s' % protein
            if not os.path.isfile(alignment_file):
                continue
            print(mut)

            # load and prune the aligned sequences, then split off the query
            alignments = load_alignments.do(alignment_file)
            alignments = util.prune_proteins_list(alignments)
            query, alignments = util.fetch_query_protein_in_alignments(
                alignments)
            if len(alignments) == 0:
                continue
            query_alignment = query['alignment']

            # fetch actual position of mutation in aligned query sequence;
            # the helper's exception types are not visible from here, so the
            # broad catch is kept
            try:
                actual_pos = correct_alignment_position.do(
                    query_alignment, position)
            except Exception:
                print(query_alignment)
                continue

            # aligned sequences, built once and reused below
            sequences = [a['alignment'] for a in alignments]

            # residues found at the desired position
            aa = check_mutation_position.do(sequences, actual_pos)

            # Each scorer receives its own copy of the list it is given.

            # simple frequency of original & mutant amino acid
            o_score, m_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa)
            # pseudo-counts, to account for missing aa
            o_ps_score, m_ps_score = pseudo_count_score.do(
                list(aa), orig_aa, mut_aa)
            # simple sequence-weighted frequency score
            sequence_weights = landgraf_sequence_weights.do(sequences)
            o_sw_score, m_sw_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)
            # using gap frequencies
            o_gf_score, m_gf_score = gapped_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)
            # shannon entropy score w/o and with sequence weights
            shannon = shannon_entropy_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa, mut_aa)
            shannon_weighted = shannon_entropy_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa, mut_aa,
                1)
            # von-neumann entropy score
            von_neumann_score = von_neumann_entropy_score.do(
                list(aa), orig_aa, mut_aa)
            # sum-of-pairs scores, without and with sequence weights
            sop = sum_of_pairs_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa, mut_aa,
                0)
            sop_wg = sum_of_pairs_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa, mut_aa,
                1)

            # append all scores together and write to file
            scores = [
                o_score, m_score, o_ps_score, m_ps_score, o_sw_score,
                m_sw_score, o_gf_score, m_gf_score, shannon, shannon_weighted,
                von_neumann_score, sop, sop_wg
            ]
            ofp.write(
                ','.join([str(item) for item in chain(mut, scores)]) + '\n')
def do(file1, file2, winsize):
    '''Write gapped-frequency scores for each mutation and its neighbours.

    For every mutation read from ``file1`` the row written to ``file2`` is
    the original comma-separated fields followed by:

    * the second value returned by ``gapped_frequency_score.do`` for the
      mutation position itself, then
    * for each distance w = 1..winsize, the first value returned by
      ``gapped_frequency_score.do`` for the residue found in the FASTA
      sequence at ``position - w``, followed by the one at ``position + w``.

    A mutation is skipped (no row written) when its alignment file is
    missing, when the sequence weights cannot be computed, or when any
    position in the window cannot be scored. That includes windows that
    would start before the first residue: a negative index would otherwise
    silently wrap around to the end of the FASTA sequence.

    Args:
        file1: path of the comma-separated mutation file.
        file2: path of the output file.
        winsize: number of neighbours scored on each side of the mutation.
    '''
    mutations = []
    with open(file1) as fp:
        for line in fp:
            mutations.append(line.strip().split(','))

    with open(file2, 'w') as ofp:
        for mut in mutations:
            print(mut)
            protein = mut[0]
            position = int(mut[1]) - 1
            orig_aa = mut[2]
            mut_aa = mut[3]

            fasta_seq = util.read_sequence('../fasta/%s.fasta' % protein)
            alfile = '../alignments/%s' % protein
            if not os.path.isfile(alfile):
                continue

            proteins = load_alignments.do(alfile)
            proteins = util.prune_proteins_list(proteins)
            query, proteins = util.fetch_query_protein_in_alignments(proteins)
            query_seq = query['alignment']
            alignments = [a['alignment'] for a in proteins]

            # The helpers' exception types are not visible from here, so the
            # broad catches of the original are kept.
            try:
                sequence_weights = landgraf_sequence_weights.do(alignments)
            except Exception as e:
                print(str(e))
                continue

            scores = []
            try:
                # 0,1,2,3 for winsize=3, hence the +1
                for w in range(winsize + 1):
                    if w == 0:
                        # mutation position itself: keep the mutant score
                        mod_pos = correct_alignment_position.do(
                            query_seq, position)
                        aa = check_mutation_position.do(alignments, mod_pos)
                        o, m = gapped_frequency_score.do(
                            list(aa), orig_aa, mut_aa, sequence_weights)
                        scores.append(m)
                        continue

                    if position - w < 0:
                        # Treat an overrun on the left exactly like one on
                        # the right instead of wrapping to the sequence end.
                        raise IndexError(
                            'window extends before the start of the sequence')

                    # left neighbour first, then right neighbour, both at
                    # distance w from the mutation position
                    for neighbour in (position - w, position + w):
                        mod_pos = correct_alignment_position.do(
                            query_seq, neighbour)
                        aa = check_mutation_position.do(alignments, mod_pos)
                        aa_in_fasta = fasta_seq[neighbour]
                        o, m = gapped_frequency_score.do(
                            list(aa), aa_in_fasta, mut_aa, sequence_weights)
                        scores.append(o)
            except Exception:
                # any position that cannot be scored drops the whole mutation
                continue

            ofp.write(
                ','.join([str(item) for item in chain(mut, scores)]) + '\n')
def do(file1, file2):
    '''Print the frequency-based scores of the first scorable mutation.

    NOTE: this is a debugging variant. Writing to ``file2`` is disabled (the
    parameter is accepted but unused) and the loop stops after the first
    mutation that reaches the scoring step.

    Args:
        file1: path of a comma-separated mutation file; the first five
            fields of each line are kept.
        file2: unused; kept so the signature stays unchanged.

    The printed list holds, in order: simple frequency (original, mutant),
    pseudo-count (original, mutant), sequence-weighted frequency (original,
    mutant) and gapped frequency (original, mutant).
    '''
    # fetch all mutations
    mutations = []
    with open(file1) as fp:
        for line in fp:
            parts = line.strip().split(',')
            mutations.append(
                [parts[0], parts[1], parts[2], parts[3], parts[4]])

    for mut in mutations:
        # fetch reqd info
        protein = mut[0]
        position = int(mut[1]) - 1
        orig_aa = mut[2]
        mut_aa = mut[3]

        # if protein not aligned - pass this
        alignment_file = '../alignments/%s' % protein
        if not os.path.isfile(alignment_file):
            continue
        print(mut)

        # load the aligned sequences and remove duplicates
        alignments = load_alignments.do(alignment_file)
        alignments = uniquify_alignments.do(alignments)

        # fetch query sequence
        query, alignments = util.fetch_query_protein_in_alignments(alignments)
        if len(alignments) == 0:
            continue

        # fetch actual position of mutation in aligned query sequence; the
        # helper's exception types are not visible here, so the broad catch
        # is kept
        try:
            actual_pos = correct_alignment_position.do(query['alignment'],
                                                       position)
        except Exception:
            print(query['alignment'])
            continue

        sequences = [k['alignment'] for k in alignments]

        # residues found at the desired position
        aa = check_mutation_position.do(sequences, actual_pos)

        # Each scorer gets its own copy of the column so that one scorer
        # cannot affect the input of the next.

        # simple frequency of original & mutant amino acid
        o_score, m_score = simple_frequency_score.do(
            list(aa), orig_aa, mut_aa)
        # pseudo-counts, to account for missing aa
        o_ps_score, m_ps_score = pseudo_count_score.do(
            list(aa), orig_aa, mut_aa)
        # simple sequence-weighted frequency score
        sequence_weights = landgraf_sequence_weights.do(sequences)
        o_sw_score, m_sw_score = simple_frequency_score.do(
            list(aa), orig_aa, mut_aa, sequence_weights)
        # using gap frequencies
        o_gf_score, m_gf_score = gapped_frequency_score.do(
            list(aa), orig_aa, mut_aa, sequence_weights)

        scores = [
            o_score, m_score, o_ps_score, m_ps_score, o_sw_score, m_sw_score,
            o_gf_score, m_gf_score
        ]
        print(scores)
        # debugging: stop after the first scored mutation
        break