Exemple #1
0
def sorensen_plus(a: str, b: str) -> float:
    """Return a Sorensen-based similarity averaged over several n-gram sizes.

    For every ``n`` from 1 up to the length of the shorter input, the
    distance between the n-grams of ``a`` and of ``b`` is computed with
    ``distance.sorensen``. The result is one minus the mean of those
    distances.

    Args:
        a: First string.
        b: Second string.

    Returns:
        ``1 - mean(distances)``. If at least one input is empty there is no
        n-gram size to compare; the result is then ``1.0`` when the inputs
        are equal (both empty) and ``0.0`` otherwise.
    """
    length = min(len(a), len(b))
    if length == 0:
        # Without this guard the mean below divides by zero and yields nan.
        return 1.0 if a == b else 0.0

    ng = [
        distance.sorensen(ngrams(a, n), ngrams(b, n))
        for n in range(1, length + 1)
    ]
    return 1 - np.sum(ng) / length
def p_spectrum(x, y, p: int = 2) -> float:
    """Compute the p-spectrum similarity of ``x`` and ``y`` from p-gram counts.

    Both inputs are split into p-grams with ``ngrams`` and counted. The
    result is the sum, over the distinct p-grams of ``x``, of the number of
    occurrences in ``x`` multiplied by the number of occurrences in ``y``.
    """
    counts_x = Counter(ngrams(x, p))
    counts_y = Counter(ngrams(y, p))

    # A Counter reports 0 for p-grams missing from ``y``, so only p-grams
    # present in both inputs contribute to the total.
    products = [
        occurrences * counts_y[gram]
        for gram, occurrences in counts_x.items()
    ]
    return np.sum(products)
Exemple #3
0
def ngram_sim(x, y, n: int = 2) -> float:
    """Binary version of n-gram similarity.

    Both inputs are split into n-grams with ``ngrams``. A dynamic-programming
    table then holds, for every pair of prefixes of the two n-gram sequences,
    the length of their longest common subsequence, where two n-grams match
    only if they compare equal (hence "binary"). The final length is
    normalised by the larger of the two n-gram counts.

    Args:
        x: First input, passed to ``ngrams``.
        y: Second input, passed to ``ngrams``.
        n: Size of the n-grams.

    Returns:
        The normalised match length in ``[0, 1]``. If neither input yields
        any n-gram, ``1.0`` is returned when ``x == y`` and ``0.0`` otherwise.
    """
    ng_a = ngrams(x, n)
    ng_b = ngrams(y, n)
    x_len = len(ng_a)
    y_len = len(ng_b)

    longest = max(x_len, y_len)
    if longest == 0:
        # Nothing to align; without this guard the normalisation below
        # raises ZeroDivisionError.
        return 1.0 if x == y else 0.0

    # Row 0 and column 0 stay zero: an empty prefix matches nothing.
    table = np.zeros((x_len + 1, y_len + 1), dtype=np.intc)

    for i in range(1, x_len + 1):
        for j in range(1, y_len + 1):
            table[i, j] = max(
                table[i, j - 1],
                table[i - 1, j],
                table[i - 1, j - 1] + (ng_a[i - 1] == ng_b[j - 1]),
            )

    return float(table[x_len, y_len]) / float(longest)
Exemple #4
0
               for s in file
           ][1:]

# Union of all symbols seen in the sequences. Building a fresh set avoids
# mutating seqs[0] through an alias.
# NOTE(review): assumes `seqs` holds one set of symbols per sequence here;
# its construction is not fully visible in this part of the file.
vocab = set().union(*seqs)

# NOTE(review): presumably " " is a padding symbol produced by `ngrams` -
# confirm against its implementation.
vocab.add(" ")

# Iterating a set of strings gives a different order from one interpreter run
# to the next (hash randomisation). The index maps below are not saved with
# the matrix, so a sorted alphabet is used to keep the column meaning of the
# saved vectors reproducible.
alphabet = sorted(vocab)

aminoacid_dict = {}
for a in alphabet:
    aminoacid_dict[a] = len(aminoacid_dict)

# Every ordered triple of symbols gets the next free column index.
trigram_dict = {}
for x in alphabet:
    for y in alphabet:
        for z in alphabet:
            trigram_dict[x + y + z] = len(trigram_dict)

# Re-read the raw sequences: the first line is skipped and the last character
# of every remaining line is dropped.
# NOTE(review): s[:-1] assumes each line ends with a newline; a final line
# without one loses its last symbol.
with codecs.open(seq_path, "r") as f_in:
    seqs = [s[:-1] for s in f_in][1:]

v_size = len(trigram_dict)

# One row per sequence, one column per possible trigram; an entry is 1 when
# the trigram occurs in the sequence.
V = np.zeros([len(seqs), v_size], dtype=int)

for i, seq in enumerate(seqs):
    for t in ngrams(seq, 3):
        V[i, trigram_dict[t]] = 1

np.save("seqs_sparse_trigram_vectors.npy", V)
Exemple #5
0
def compute_similarity_matrix_ngram_sparse(
    *,
    repr_vocab,
    full_vocab,
    ngram_to_index,
    n: int,
    use_tqdm: bool = True,
):
    """Compute n-gram similarities between a full and a representative vocabulary.

    Every string is turned into a binary indicator vector over the n-grams
    listed in ``ngram_to_index``. The product of the two indicator matrices
    counts, for each pair of strings, the distinct indexed n-grams they
    share. That count is divided by the larger of the two strings' total
    n-gram counts.

    Args:
        repr_vocab: Indexable sequence of representative strings.
        full_vocab: Indexable sequence of all strings.
        ngram_to_index: Mapping from n-gram to its row/column index.
        n: Size of the n-grams; converted with ``int``.
        use_tqdm: Whether to wrap the three loops in ``tqdm`` progress bars.

    Returns:
        A dense array of shape ``(len(full_vocab), len(repr_vocab))``. An
        entry is ``nan`` when both strings of the pair yield no n-grams,
        because the normaliser is then zero.

    Raises:
        KeyError: If an n-gram of a representative string is missing from
            ``ngram_to_index``. Unknown n-grams of ``full_vocab`` strings are
            skipped instead.
    """
    v_size = len(ngram_to_index)
    n = int(n)

    # R is the transposed matrix of the one-hot representations of the representative vocab
    R = dok_matrix((v_size, len(repr_vocab)), dtype=int)
    R_ng = np.zeros([len(repr_vocab)], dtype=int)

    it_1 = np.arange(R.shape[1])
    if use_tqdm:
        it_1 = tqdm(
            it_1,
            desc=
            f"{EMOJI} compute one hot representation of representative vocabulary"
        )
    for i in it_1:
        ng = ngrams(repr_vocab[i], n)
        R_ng[i] = len(ng)
        for gram in ng:
            R[ngram_to_index[gram], i] = 1

    # tocsr() returns a new matrix rather than converting in place, so the
    # result has to be rebound for the conversion to take effect.
    R = R.tocsr()

    V = dok_matrix((len(full_vocab), v_size), dtype=int)
    V_ng = np.zeros([len(full_vocab)], dtype=int)

    it_2 = np.arange(V.shape[0])
    if use_tqdm:
        it_2 = tqdm(
            it_2,
            desc=f"{EMOJI} compute one hot representation of full vocabulary")
    for i in it_2:
        ng = ngrams(full_vocab[i], n)
        V_ng[i] = len(ng)
        for gram in ng:
            try:
                column = ngram_to_index[gram]
            except KeyError:
                # n-grams that are not indexed cannot match any
                # representative string, so they are ignored.
                continue
            V[i, column] = 1
    V = V.tocsr()

    # L[i, j] is the larger of the n-gram counts of full string i and
    # representative string j.
    L = np.empty([len(V_ng), len(R_ng)], dtype=int)

    it_3 = range(len(V_ng))
    if use_tqdm:
        it_3 = tqdm(
            it_3,
            desc=
            f"{EMOJI} Compute normalization matrix with maximum number of n-grams for the proteins"
        )
    for i in it_3:
        # Fill one row at a time so the progress bar still advances per row.
        L[i, :] = np.maximum(V_ng[i], R_ng)

    return V.dot(R).toarray() / L
Exemple #6
0
def _get_ngram_elements_helper(strings, *, ngram_to_index, n: int, desc=None):
    """Translate each string into the list of indices of its n-grams.

    The strings are iterated through ``tqdm`` (labelled with ``desc``), split
    into n-grams with ``ngrams`` and every n-gram is looked up in
    ``ngram_to_index``. An n-gram missing from the mapping raises whatever
    the mapping's item lookup raises.

    Returns:
        A list with one entry per string, each a list of indices in n-gram
        order.
    """
    index_lists = []
    for text in tqdm(strings, desc=desc):
        indices = []
        for gram in ngrams(text, n):
            indices.append(ngram_to_index[gram])
        index_lists.append(indices)
    return index_lists