Example #1
0
    def test_2(self):
        """Dump the blosum62 substitution matrix as a tab-separated table.

        The column order is the order in which residue letters first appear
        in the first position of the blosum62 keys.  The full square matrix
        is printed row by row and written to ``inputs/blosum.txt`` with a
        header line holding the column letters.

        The ``inputs`` directory must already exist; ``open`` raises
        otherwise.
        """
        from Bio.SubsMat.MatrixInfo import blosum62
        print(blosum62)

        # Collect first-position letters in first-seen order.  The companion
        # set keeps the membership test O(1) instead of scanning the list.
        columns = []
        seen = set()
        for pair in blosum62:
            letter = pair[0]
            if letter not in seen:
                seen.add(letter)
                columns.append(letter)

        print(columns)

        # blosum62 stores each unordered pair under a single key order, so
        # fall back to the swapped key when the direct lookup misses.
        points = []
        for first in columns:
            row = []
            for second in columns:
                try:
                    value = blosum62[(first, second)]
                except KeyError:
                    value = blosum62[(second, first)]
                row.append(value)
            points.append(row)

        for row in points:
            print(row)

        # ``handle`` rather than ``file`` to avoid shadowing the builtin.
        with open("inputs/blosum.txt", "w") as handle:
            handle.write("\t".join(columns))
            handle.write("\n")

            for row in points:
                handle.write("\t".join(map(str, row)))
                handle.write("\n")
def get_distance_matrix():
    """Return a square array of shifted, negated blosum62 scores.

    Rows and columns follow the sorted set of letters found in the first
    position of the blosum62 keys.  Each score is looked up under whichever
    key order exists, and the result is ``-(score + min_score + 1)``.

    NOTE(review): an entry is negative wherever ``score + min_score + 1`` is
    positive -- confirm that is intended for a matrix used as a "distance".
    """
    letters = sorted({pair[0] for pair in blosum62})

    def lookup(a, b):
        # Only one orientation of each pair is stored in blosum62.
        return blosum62[a, b] if (a, b) in blosum62 else blosum62[b, a]

    scores = np.array([[lookup(a, b) for a in letters] for b in letters])
    return -(scores + scores.min() + 1)
Example #3
0
def sim(ch1, ch2):
    """Look up the blosum62 score for a pair of amino acids.

    The pair is tried as ``(ch1, ch2)`` first; if that key is absent the
    swapped key ``(ch2, ch1)`` is used instead.

    :param ch1: string
    :param ch2: string
    :return: int
    :raises KeyError: if neither key order is present in blosum62
    """
    pair = (ch1, ch2)
    if pair not in blosum62:
        pair = (ch2, ch1)
    return blosum62[pair]
Example #4
0
def return_blosum62_dict():
    """Map each protein letter to its row of blosum62 scores.

    For every letter in ``IUPACData.protein_letters`` the row holds that
    letter's score against every letter, in the same iteration order.  Rows
    for 'X', 'B' and 'Z' are then set from fixed literal lists.

    NOTE(review): the literal X/B/Z rows look like they follow the
    conventional BLOSUM column order (A R N D C Q E ...); confirm that this
    matches the iteration order of ``IUPACData.protein_letters`` used for
    the computed rows.

    :return: dict of letter -> list of scores
    """
    # De-duplicate through a dict so the order matches dict iteration.
    letters = list(dict.fromkeys(IUPACData.protein_letters, 0))

    rows = {}
    for aa in letters:
        rows[aa] = [
            blosum62[(aa, bb)] if (aa, bb) in blosum62 else blosum62[(bb, aa)]
            for bb in letters
        ]

    rows['X'] = [
        0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2,
        -1, -1
    ]
    rows['B'] = [
        -2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3
    ]
    rows['Z'] = [
        -1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2
    ]
    return rows
Example #5
0
def embed(x_train, y_train, output_csv=None):
    """Replace each sequence character with a 1-D MDS coordinate.

    A one-component MDS is fitted on the matrix from
    ``get_distance_matrix()`` and each letter (sorted first-position letters
    of the blosum62 keys) is mapped to its coordinate.  Every sequence in
    ``x_train`` becomes one row of a zero-initialised array whose width is
    the longest sequence length, so shorter sequences keep trailing zeros.

    :param x_train: sized collection of sequences (must be non-empty, since
        ``max`` is taken over their lengths)
    :param y_train: labels, returned unchanged
    :param output_csv: if truthy, destination for a CSV of the embedded rows
        with ``y_train`` appended as extra column(s)
    :return: tuple ``(processed, y_train)``
    :raises KeyError: if a sequence contains a character with no embedding
    """
    max_len = max(len(seq) for seq in x_train)

    letters = sorted({pair[0] for pair in blosum62})
    distances = get_distance_matrix()
    mds = MDS(n_components=1, dissimilarity='precomputed')
    coordinates = mds.fit_transform(distances)
    aa_to_embedding = {
        letter: coordinate[0]
        for letter, coordinate in zip(letters, coordinates)
    }

    processed = np.zeros((len(x_train), max_len))
    for row, seq in enumerate(x_train):
        for col, residue in enumerate(seq):
            processed[row, col] = aa_to_embedding[residue]

    if output_csv:
        print('Saving embedded csv')
        table = pd.DataFrame(np.c_[processed, y_train])
        table.to_csv(output_csv,
                     header=None,
                     index=False,
                     float_format='%.3f')
    return processed, y_train
Example #6
0
from Bio.SubsMat.MatrixInfo import blosum62

# Symmetric score table: every blosum62 entry is stored under both key
# orders so lookups never need to try the swapped pair.
mmatch = {}
for (i, j), pair_value in blosum62.items():
    mmatch[i, j] = pair_value
    mmatch[j, i] = pair_value


def score(seq1, seq2, gap, match, d_old=None, depth=0):
    """Fill rows of a global-alignment score table, one row per seq2 item.

    Each cell is the maximum of: the cell above plus ``gap``, the diagonal
    cell plus ``match[s, t]`` (``s`` from seq2, ``t`` from seq1), and the
    cell to the left plus ``gap``.

    :param seq1: sequence indexing the columns
    :param seq2: sequence indexing the rows to compute
    :param gap: gap score added per step
    :param match: mapping from ``(s, t)`` pairs to substitution scores
    :param d_old: optional previously computed rows; when truthy its last
        row seeds the computation and only the new rows are returned
    :param depth: number of seq2 rows already consumed, used for the first
        column of each new row
    :return: list of rows; includes the initial row only when ``d_old`` is
        falsy
    """
    resuming = bool(d_old)
    if resuming:
        previous = d_old[-1]
    else:
        previous = [col * gap for col in range(len(seq1) + 1)]

    rows = [previous]
    for row_number, s in enumerate(seq2, start=depth + 1):
        current = [row_number * gap]
        for col, t in enumerate(seq1):
            from_above = previous[col + 1] + gap
            from_diagonal = previous[col] + match[s, t]
            from_left = current[-1] + gap
            current.append(max(from_above, from_diagonal, from_left))
        rows.append(current)
        previous = current

    # When continuing from d_old, the seed row belongs to the caller already.
    return rows[1:] if resuming else rows


def backtrace(seq1, seq2, score_matrix, gap, match):
    """Start walking a score matrix back from row len(seq2), column len(seq1).

    :param seq1: sequence indexed by the column counter ``j``
    :param seq2: sequence indexed by the row counter ``i``
    :param score_matrix: nested sequence indexed as ``score_matrix[i][j]``
    :param gap: gap score added to the vertical and horizontal candidates
    :param match: mapping from ``(s, t)`` pairs to substitution scores

    NOTE(review): only the beginning of this function is present here.  The
    visible loop body never changes ``i`` or ``j``, ``res1``/``res2`` are
    never filled, and nothing is returned -- the remainder appears to be
    missing and must be restored before this can be used.
    """
    i, j = len(seq2), len(seq1)

    res1, res2 = [], []
    while i != 0 or j != 0:
        # Characters for the current cell; index -1 wraps around when i or j
        # is already 0.
        s, t = seq2[i - 1], seq1[j - 1]
        # Candidate scores: from the row above, from the column to the left,
        # and from the diagonal with the substitution score.
        l = [score_matrix[i - 1][j] + gap,
             score_matrix[i][j - 1] + gap,
             score_matrix[i - 1][j - 1] + match[s, t]]
def create_figures(feature_to_weighted_sums, weight_totals, min_total,
                   report_dir, filetype):
    """Plot feature correlations between amino acids next to blosum62 scores.

    Features whose name matches ``aa_to_([A-Z])$`` (anchored at the start of
    the name) are reduced to a vector: ``weighted_sums / weight_totals``
    where ``weight_totals >= min_total`` and the sentinel -1 elsewhere,
    flattened, with every entry equal to -1 removed (so genuine -1 means are
    dropped too).

    For the letters present both in those features and in the blosum62 keys,
    two square matrices are built -- pairwise Pearson correlations of the
    feature vectors, and blosum62 scores -- each with ``None`` assigned on
    the diagonal (stored as NaN in the float array).  Each matrix is drawn
    as a heatmap and saved, then the Pearson correlation between the two
    lower triangles is printed.

    :param feature_to_weighted_sums: mapping of feature name -> array
    :param weight_totals: array compared against ``min_total`` and used as
        the divisor
    :param min_total: inclusion threshold for ``weight_totals``
    :param report_dir: output location; must support ``/`` with a string
        (presumably a ``pathlib.Path`` -- confirm with callers)
    :param filetype: passed as ``format=`` to ``plt.savefig``; the file
        names themselves always end in ``.pdf``
    """
    # Every letter appearing in either position of a blosum62 key.
    aa_blosum = {aa for pair in blosum62 for aa in pair}

    include_mask = weight_totals >= min_total

    feature_pattern = re.compile(r'aa_to_([A-Z])$')
    aa_to_features = {}
    for feature_name, weighted_sums in feature_to_weighted_sums.items():
        found = feature_pattern.match(feature_name)
        if not found:
            continue
        mean_by_heads = np.where(include_mask,
                                 weighted_sums / weight_totals, -1)
        flat = mean_by_heads.flatten()
        aa_to_features[found[1]] = flat[flat != -1]

    aas_set = set(aa_to_features)
    print('Excluding following AAs not in feature set', aa_blosum - aas_set)
    print('Excluding following AAs not in blosum62', aas_set - aa_blosum)
    aa_list = sorted(aas_set & aa_blosum)
    n_aa = len(aa_list)

    cmap = 'Blues'

    def save_heatmap(matrix, filename, **heatmap_kwargs):
        # Draw one labelled heatmap, write it out and release the figure.
        ax = sns.heatmap(matrix, cmap=cmap, **heatmap_kwargs)
        ax.set_xticklabels(aa_list)
        ax.set_yticklabels(aa_list)
        plt.savefig(report_dir / filename, format=filetype)
        plt.close()

    corr = np.zeros((n_aa, n_aa))
    for i, aa1 in enumerate(aa_list):
        for j, aa2 in enumerate(aa_list):
            if i != j:
                corr[i, j], _ = pearsonr(aa_to_features[aa1],
                                         aa_to_features[aa2])
            else:
                corr[i, j] = None
    save_heatmap(corr, 'aa_corr_to.pdf', vmin=-0.5)

    blosum = np.zeros((n_aa, n_aa))
    for i, aa1 in enumerate(aa_list):
        for j, aa2 in enumerate(aa_list):
            if i == j:
                blosum[i, j] = None
                continue
            # Only one orientation of each pair is stored in blosum62.
            value = blosum62.get((aa1, aa2))
            if value is None:
                value = blosum62.get((aa2, aa1))
            blosum[i, j] = value
    save_heatmap(blosum, 'blosum62.pdf', vmin=-4, vmax=4)

    # Strict lower triangle, row by row, so each unordered pair counts once.
    lower_cells = [(i, j) for i in range(n_aa) for j in range(i)]
    corr_scores = [corr[i, j] for i, j in lower_cells]
    blos_scores = [blosum[i, j] for i, j in lower_cells]
    print('Pearson Correlation between feature corr and blosum',
          pearsonr(corr_scores, blos_scores)[0])
Example #8
0
    if output_csv:
        print('Saving embedded csv')
        pd.DataFrame(np.c_[processed, y_train]).to_csv(output_csv,
                                                       header=None,
                                                       index=False,
                                                       float_format='%.3f')
    return processed, y_train


if __name__ == '__main__':
    # Scratch driver: enumerate every k-mer over the blosum62 letters with a
    # 1-based index, then load train.csv and report the column shapes.
    k = 3
    aas = sorted({pair[0] for pair in blosum62})
    kmer_to_index = {
        ''.join(kmer): index
        for index, kmer in enumerate(product(aas, repeat=k), start=1)
    }
    print(kmer_to_index)
    print(len(kmer_to_index))

    # train.csv has no header row; column 0 is read as the inputs and
    # column 1 as the targets.
    data = pd.read_csv('train.csv', delimiter=',', header=None).to_numpy()
    x_train = data[:, 0]
    y_train = data[:, 1]
    print(x_train.shape, y_train.shape)
from Bio.SubsMat.MatrixInfo import blosum62

# Mirror every blosum62 entry under both key orders so that a lookup works
# regardless of which residue comes first.
mmatch = {}
for (i, j), pair_value in blosum62.items():
    mmatch[i, j] = pair_value
    mmatch[j, i] = pair_value


def score(seq1, seq2, gap, match, d_old=None, depth=0):
    """Compute alignment score rows for ``seq2`` against ``seq1``.

    A cell takes the best of three moves: down from the previous row
    (``+ gap``), diagonally from the previous row (``+ match[s, t]`` with
    ``s`` from seq2 and ``t`` from seq1), or right within the current row
    (``+ gap``).

    :param seq1: sequence laid out along the columns
    :param seq2: sequence whose items each produce one new row
    :param gap: score added for a gap move
    :param match: mapping of ``(s, t)`` to a substitution score
    :param d_old: earlier rows to continue from; if truthy, its last row is
        the starting row and is not repeated in the result
    :param depth: count of rows already produced before this call, used to
        compute the first-column value of each new row
    :return: list of rows (with the initial row when ``d_old`` is falsy)
    """
    continuing = bool(d_old)
    above = d_old[-1] if continuing else [
        col * gap for col in range(len(seq1) + 1)
    ]

    table = [above]
    for row_number, s in enumerate(seq2, start=depth + 1):
        row = [row_number * gap]
        for col, t in enumerate(seq1):
            down = above[col + 1] + gap
            diagonal = above[col] + match[s, t]
            right = row[-1] + gap
            row.append(max(down, diagonal, right))
        table.append(row)
        above = row

    if continuing:
        # The seed row came from the caller; hand back only the new rows.
        return table[1:]
    return table


def backtrace(seq1, seq2, score_matrix, gap, match):
    i, j = len(seq2), len(seq1)

    res1, res2 = [], []
    while i != 0 or j != 0:
        s, t = seq2[i - 1], seq1[j - 1]
        l = [
            score_matrix[i - 1][j] + gap, score_matrix[i][j - 1] + gap,