def test_2(self):
    """Dump the BLOSUM62 matrix as a square, tab-separated table.

    Prints the raw ``blosum62`` mapping, the column labels and every row of
    the square matrix, then writes the labels followed by the rows to
    ``inputs/blosum.txt`` (the ``inputs`` directory is not created here).
    """
    from Bio.SubsMat.MatrixInfo import blosum62

    # Single-argument, parenthesised print calls behave the same on Python 2
    # and Python 3; the bare print statement only parses on Python 2.
    print(blosum62)

    # Column labels: distinct first letters of the key pairs, in first-seen
    # order. The set only speeds up the membership test.
    columns = []
    seen = set()
    for key in blosum62:
        if key[0] not in seen:
            seen.add(key[0])
            columns.append(key[0])
    print(columns)

    points = []
    for first in columns:
        row = []
        for second in columns:
            # Fall back to the mirrored key when (first, second) is absent.
            try:
                row.append(blosum62[(first, second)])
            except KeyError:
                row.append(blosum62[(second, first)])
        points.append(row)

    for row in points:
        print(row)

    # ``handle`` instead of ``file`` so the Python 2 builtin is not shadowed.
    with open("inputs/blosum.txt", "w") as handle:
        handle.write("\t".join(columns))
        handle.write("\n")
        for row in points:
            handle.write("\t".join(map(str, row)))
            handle.write("\n")
def get_distance_matrix():
    """Return a square matrix of shifted, negated BLOSUM62 scores.

    Rows and columns both follow the sorted set of first letters found in the
    ``blosum62`` keys. Each score is looked up under ``(a, b)`` and, when that
    key is absent, under ``(b, a)``.

    :return: 2-D NumPy array equal to ``-(S + S.min() + 1)`` where ``S`` is
        the score matrix
    """
    letters = sorted({pair[0] for pair in blosum62})

    rows = []
    for b in letters:
        row = []
        for a in letters:
            key = (a, b) if (a, b) in blosum62 else (b, a)
            row.append(blosum62[key])
        rows.append(row)

    scores = np.array(rows)
    return -(scores + scores.min() + 1)
def sim(ch1, ch2):
    """
    Look up the BLOSUM62 score of an amino-acid pair.

    The pair is tried as ``(ch1, ch2)`` first; if that key is not present
    the mirrored key ``(ch2, ch1)`` is used instead.

    :param ch1: string
    :param ch2: string
    :return: int
    """
    pair = (ch1, ch2)
    if pair not in blosum62:
        pair = (ch2, ch1)
    return blosum62[pair]
def return_blosum62_dict():
    """Map each residue code to its row of BLOSUM62 scores.

    Every row has one score per letter of ``IUPACData.protein_letters``, in
    that order. Rows for those letters are looked up in ``blosum62`` (trying
    the mirrored key when needed); rows for the codes ``X``, ``B`` and ``Z``
    come from literal values.

    :return: dict from residue code to a list of scores
    """
    # Iterate the letter string itself: its order is fixed, whereas the order
    # of a dict built from it is only guaranteed on Python 3.7+.
    letters = IUPACData.protein_letters

    Dictblosum = {}
    for aa in letters:
        Dictblosum[aa] = [
            blosum62[(aa, bb)] if (aa, bb) in blosum62 else blosum62[(bb, aa)]
            for bb in letters
        ]

    # The literal rows below are the X/B/Z rows of the NCBI BLOSUM62 table,
    # written in that table's column order. Using them as-is put their columns
    # in a different order from the computed rows above, so each value is
    # re-indexed by letter into ``letters`` order.
    # NOTE(review): assumes IUPACData.protein_letters holds only the 20
    # standard residues; any other letter raises KeyError here.
    literal_order = "ARNDCQEGHILKMFPSTWYV"
    literal_rows = (
        ('X', [
            0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0,
            -2, -1, -1
        ]),
        ('B', [
            -2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4,
            -3, -3
        ]),
        ('Z', [
            -1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3,
            -2, -2
        ]),
    )
    for code, row in literal_rows:
        by_letter = dict(zip(literal_order, row))
        Dictblosum[code] = [by_letter[bb] for bb in letters]

    return Dictblosum
def embed(x_train, y_train, output_csv=None):
    """Encode sequences as zero-padded rows of one-dimensional embeddings.

    A 1-component MDS is fitted on the matrix from ``get_distance_matrix()``
    and its coordinates are paired with the sorted first letters of the
    ``blosum62`` keys. Each character of each sequence is replaced by the
    coordinate of its letter; positions past the end of a sequence stay 0.

    :param x_train: collection of sequences (each supports ``len`` and
        iteration over single-letter codes)
    :param y_train: labels; returned unchanged and appended as the last
        column(s) of the optional CSV
    :param output_csv: when truthy, destination passed to
        ``DataFrame.to_csv``
    :return: tuple ``(processed, y_train)`` where ``processed`` has shape
        ``(len(x_train), longest sequence length)``
    """
    longest = max(len(seq) for seq in x_train)
    alphabet = sorted({key[0] for key in blosum62})
    distances = get_distance_matrix()

    reducer = MDS(n_components=1, dissimilarity='precomputed')
    coordinates = reducer.fit_transform(distances)
    aa_to_embedding = {}
    for letter, point in zip(alphabet, coordinates):
        aa_to_embedding[letter] = point[0]

    processed = np.zeros((len(x_train), longest))
    for row, seq in enumerate(x_train):
        for col, residue in enumerate(seq):
            processed[row, col] = aa_to_embedding[residue]

    if output_csv:
        print('Saving embedded csv')
        table = pd.DataFrame(np.c_[processed, y_train])
        table.to_csv(output_csv,
                     header=None,
                     index=False,
                     float_format='%.3f')

    return processed, y_train
from Bio.SubsMat.MatrixInfo import blosum62

# Symmetric copy of blosum62: each score is stored under both (i, j) and
# (j, i), so a lookup never has to try the mirrored key.
mmatch = {}
for i, j in blosum62.keys():
    mmatch[i, j] = mmatch[j, i] = blosum62[i, j]


def score(seq1, seq2, gap, match, d_old=None, depth=0):
    """Fill rows of an alignment score table, one row per element of seq2.

    Each new cell is the maximum of: the cell above plus ``gap``, the
    diagonal cell plus ``match[s, t]``, and the cell to the left plus ``gap``.

    :param seq1: sequence indexing the columns of the table
    :param seq2: sequence whose elements each add one row
    :param gap: value added for a vertical or horizontal step
    :param match: mapping indexed as ``match[s, t]`` with ``s`` taken from
        seq2 and ``t`` from seq1
    :param d_old: optional rows from an earlier call; when truthy, its last
        row seeds this call
    :param depth: row offset; the first column of the r-th new row is
        ``(depth + r + 1) * gap``
    :return: list of rows; the seed row is included only when ``d_old`` is
        falsy
    """
    if not d_old:
        # Fresh table: row 0 is 0, gap, 2*gap, ... across seq1.
        d = [[i*gap for i in range(len(seq1)+1)]]
    else:
        # Continue from the last row of the earlier table.
        d = [d_old[-1]]
    j = depth
    for s in seq2:
        cur = [(j+1) * gap]
        for i, t in enumerate(seq1):
            cur.append(max(d[-1][i+1] + gap, d[-1][i] + match[s, t], cur[-1] + gap))
        d.append(cur)
        j += 1
    # When continuing, drop the seed row so only the newly added rows return.
    return d if not d_old else d[1:]


def backtrace(seq1, seq2, score_matrix, gap, match):
    """Begin walking ``score_matrix`` back from its last cell.

    Only the start of this function is present in this excerpt; the rest of
    the loop body, including how ``i`` and ``j`` are updated and what is
    returned, is not visible here.
    """
    # i indexes rows (seq2), j indexes columns (seq1).
    i, j = len(seq2), len(seq1)
    res1, res2 = [], []
    while i != 0 or j != 0:
        # NOTE(review): when i or j is 0 the index -1 wraps around to the
        # last element/row - confirm the unseen remainder accounts for this.
        s, t = seq2[i - 1], seq1[j - 1]
        # Candidate scores: from the row above, from the left, from the diagonal.
        l = [score_matrix[i - 1][j] + gap, score_matrix[i][j - 1] + gap, score_matrix[i - 1][j - 1] + match[s, t]]
def create_figures(feature_to_weighted_sums, weight_totals, min_total, report_dir, filetype):
    """Plot per-residue feature correlations next to BLOSUM62 and compare them.

    For every feature named ``aa_to_<LETTER>`` a feature vector is built from
    ``weighted_sums / weight_totals`` restricted to positions where
    ``weight_totals >= min_total``. Two heatmaps are saved in ``report_dir``:
    the pairwise Pearson correlation of those vectors (``aa_corr_to``) and
    the BLOSUM62 scores for the same residues (``blosum62``). Finally the
    Pearson correlation between the two lower triangles is printed.

    :param feature_to_weighted_sums: mapping from feature name to an array of
        weighted sums
    :param weight_totals: array of totals the weighted sums are divided by
    :param min_total: positions with a smaller total are excluded
    :param report_dir: output directory; must support the ``/`` operator
        (e.g. ``pathlib.Path``)
    :param filetype: format handed to ``savefig``; also used as the file
        extension (``pdf`` when falsy)
    """
    aa_blosum = set()
    for aa1, aa2 in blosum62:
        aa_blosum.update((aa1, aa2))

    include_mask = weight_totals >= min_total
    p = re.compile(r'aa_to_([A-Z])$')
    aa_to_features = {}
    for feature_name, weighted_sums in feature_to_weighted_sums.items():
        m = p.match(feature_name)
        if not m:
            continue
        # -1 marks excluded positions and is filtered out right after.
        # NOTE(review): a genuine mean of exactly -1 would be dropped too.
        mean_by_heads = np.where(include_mask, weighted_sums / weight_totals, -1)
        feature_vector = mean_by_heads.flatten()
        aa_to_features[m[1]] = feature_vector[feature_vector != -1]

    aas_set = set(aa_to_features)
    print('Excluding following AAs not in feature set', aa_blosum - aas_set)
    print('Excluding following AAs not in blosum62', aas_set - aa_blosum)

    aa_list = sorted(aas_set & aa_blosum)
    n_aa = len(aa_list)

    # The file extension now follows the format actually written; previously
    # the name always ended in ".pdf" regardless of ``filetype``. A falsy
    # filetype keeps the old ".pdf" name, from which savefig infers the format.
    extension = filetype if filetype else 'pdf'
    cmap = 'Blues'

    # Diagonal cells stay NaN in both matrices.
    corr = np.full((n_aa, n_aa), np.nan)
    for i, aa1 in enumerate(aa_list):
        vector1 = aa_to_features[aa1]
        for j, aa2 in enumerate(aa_list):
            if i != j:
                corr[i, j] = pearsonr(vector1, aa_to_features[aa2])[0]
    ax = sns.heatmap(corr, cmap=cmap, vmin=-0.5)
    ax.set_xticklabels(aa_list)
    ax.set_yticklabels(aa_list)
    plt.savefig(report_dir / f'aa_corr_to.{extension}', format=filetype)
    plt.close()

    blosum = np.full((n_aa, n_aa), np.nan)
    for i, aa1 in enumerate(aa_list):
        for j, aa2 in enumerate(aa_list):
            if i == j:
                continue
            # Try (aa1, aa2) first, then the mirrored key.
            pair_score = blosum62.get((aa1, aa2))
            if pair_score is None:
                pair_score = blosum62.get((aa2, aa1))
            blosum[i, j] = pair_score
    ax = sns.heatmap(blosum, cmap=cmap, vmin=-4, vmax=4)
    ax.set_xticklabels(aa_list)
    ax.set_yticklabels(aa_list)
    plt.savefig(report_dir / f'blosum62.{extension}', format=filetype)
    plt.close()

    # Compare the strict lower triangles of the two matrices.
    corr_scores = []
    blos_scores = []
    for i in range(n_aa):
        for j in range(i):
            corr_scores.append(corr[i, j])
            blos_scores.append(blosum[i, j])
    print('Pearson Correlation between feature corr and blosum',
          pearsonr(corr_scores, blos_scores)[0])
if output_csv: print('Saving embedded csv') pd.DataFrame(np.c_[processed, y_train]).to_csv(output_csv, header=None, index=False, float_format='%.3f') return processed, y_train if __name__ == '__main__': # x_train, y_train = np.hsplit(pd.read_csv('train.csv', delimiter=',', header=None).to_numpy(), 2) # x_train, y_train = x_train[:, 0], y_train[:, 0] # # x_train, y_train = x_train[:, 0], y_train[:, 0] # x_train = embed(x_train, y_train, 'embedded.csv') k = 3 aas = sorted({t[0] for t in sorted(blosum62.keys())}) kmer_to_index = { ''.join(kmer): i + 1 for i, kmer in enumerate(product(aas, repeat=k)) } print(kmer_to_index) print(len(kmer_to_index)) # print(list(product(aas, repeat=3))) # product([1,2,3], repeat=3) # print([1, 2, 3, 4, 5][:-3 + 1]) data = pd.read_csv('train.csv', delimiter=',', header=None).to_numpy() # x_train, y_train = data[:, 0], data[:, 1] print(x_train.shape, y_train.shape)
from Bio.SubsMat.MatrixInfo import blosum62

# Symmetric copy of blosum62: each score is stored under both (i, j) and
# (j, i), so a lookup never has to try the mirrored key.
mmatch = {}
for i, j in blosum62.keys():
    mmatch[i, j] = mmatch[j, i] = blosum62[i, j]


def score(seq1, seq2, gap, match, d_old=None, depth=0):
    """Fill rows of an alignment score table, one row per element of seq2.

    Each new cell is the maximum of: the cell above plus ``gap``, the
    diagonal cell plus ``match[s, t]``, and the cell to the left plus ``gap``.

    :param seq1: sequence indexing the columns of the table
    :param seq2: sequence whose elements each add one row
    :param gap: value added for a vertical or horizontal step
    :param match: mapping indexed as ``match[s, t]`` with ``s`` taken from
        seq2 and ``t`` from seq1
    :param d_old: optional rows from an earlier call; when truthy, its last
        row seeds this call
    :param depth: row offset; the first column of the r-th new row is
        ``(depth + r + 1) * gap``
    :return: list of rows; the seed row is included only when ``d_old`` is
        falsy
    """
    if not d_old:
        # Fresh table: row 0 is 0, gap, 2*gap, ... across seq1.
        d = [[i * gap for i in range(len(seq1) + 1)]]
    else:
        # Continue from the last row of the earlier table.
        d = [d_old[-1]]
    j = depth
    for s in seq2:
        cur = [(j + 1) * gap]
        for i, t in enumerate(seq1):
            cur.append(
                max(d[-1][i + 1] + gap, d[-1][i] + match[s, t], cur[-1] + gap))
        d.append(cur)
        j += 1
    # When continuing, drop the seed row so only the newly added rows return.
    return d if not d_old else d[1:]


def backtrace(seq1, seq2, score_matrix, gap, match):
    """Begin walking ``score_matrix`` back from its last cell.

    Only the start of this function is present in this excerpt: it is cut
    off inside the list of candidate scores, so the rest of the loop body
    and the return value are not visible here.
    """
    # i indexes rows (seq2), j indexes columns (seq1).
    i, j = len(seq2), len(seq1)
    res1, res2 = [], []
    while i != 0 or j != 0:
        # NOTE(review): when i or j is 0 the index -1 wraps around to the
        # last element/row - confirm the unseen remainder accounts for this.
        s, t = seq2[i - 1], seq1[j - 1]
        # Candidate scores, starting with the row above and the cell to the left.
        l = [
            score_matrix[i - 1][j] + gap,
            score_matrix[i][j - 1] + gap,