def subword_enrichment(df, n=4):

    # 1. Use `vsm.ngram_vsm` to create a character-level
    # VSM from `df`, using the above parameter `n` to
    # set the size of the ngrams.

    ##### YOUR CODE HERE
    df_ngrams = vsm.ngram_vsm(df, n)

    # 2. Use `vsm.character_level_rep` to get the representation
    # for every word in `df` according to the character-level
    # VSM you created above.

    ##### YOUR CODE HERE
    new_matrix = []
    for word in df.index:
        new_vec = vsm.character_level_rep(word, df_ngrams, n)
        new_matrix.append(new_vec)

    # 3. For each representation created at step 2, add in its
    # original representation from `df`. (This should use
    # element-wise addition; the dimensionality of the vectors
    # will be unchanged.)

    ##### YOUR CODE HERE
    # Use `df`'s own columns so that `add` aligns element-wise instead of
    # producing NaNs from mismatched integer column labels:
    df_sub = pd.DataFrame(new_matrix, index=df.index, columns=df.columns)
    df_sub_sum = df.add(df_sub)

    # 4. Return a `pd.DataFrame` with the same index and column
    # values as `df`, but filled with the new representations
    # created at step 3.

    ##### YOUR CODE HERE
    return df_sub_sum
Example #2
def subword_enrichment(df, n=4):

    # 1. Use `vsm.ngram_vsm` to create a character-level
    # VSM from `df`, using the above parameter `n` to
    # set the size of the ngrams.

    ##### YOUR CODE HERE
    ngram_df = vsm.ngram_vsm(df, n)

    # 2. Use `vsm.character_level_rep` to get the representation
    # for every word in `df` according to the character-level
    # VSM you created above.

    ##### YOUR CODE HERE
    reps = np.stack(
        df.index.map(lambda w: vsm.character_level_rep(w, ngram_df, n)))

    # 3. For each representation created at step 2, add in its
    # original representation from `df`. (This should use
    # element-wise addition; the dimensionality of the vectors
    # will be unchanged.)

    ##### YOUR CODE HERE
    result = df + reps

    # 4. Return a `pd.DataFrame` with the same index and column
    # values as `df`, but filled with the new representations
    # created at step 3.

    ##### YOUR CODE HERE
    return result
Example #3
def subword_enrichment(df, n=4):

    # 1. Use `vsm.ngram_vsm` to create a character-level
    # VSM from `df`, using the above parameter `n` to
    # set the size of the ngrams.

    vsm_char = vsm.ngram_vsm(df, n=n)

    # 2. Use `vsm.character_level_rep` to get the representation
    # for every word in `df` according to the character-level
    # VSM you created above.

    result = {}
    for word in df.index:
        result[word] = np.add(vsm.character_level_rep(word, vsm_char, n=n),
                              np.array(df.loc[word]))

    # 3. For each representation created at step 2, add in its
    # original representation from `df`. (This should use
    # element-wise addition; the dimensionality of the vectors
    # will be unchanged.)

    # Pass `df.columns` so the result has the same column labels as `df`:
    new_df = pd.DataFrame.from_dict(result, orient='index', columns=df.columns)

    # 4. Return a `pd.DataFrame` with the same index and column
    # values as `df`, but filled with the new representations
    # created at step 3.

    return new_df
Example #4
def subword_enrichment(df, n=4):

    # 1. Use `vsm.ngram_vsm` to create a character-level
    # VSM from `df`, using the above parameter `n` to
    # set the size of the ngrams.

    ##### YOUR CODE HERE
    df_ngrams = vsm.ngram_vsm(df, n=n)

    # 2. Use `vsm.character_level_rep` to get the representation
    # for every word in `df` according to the character-level
    # VSM you created above.

    ##### YOUR CODE HERE
    char_reps = {}
    for word in df.index:
        char_reps[word] = vsm.character_level_rep(word, df_ngrams, n=n)

    # 3. For each representation created at step 2, add in its
    # original representation from `df`. (This should use
    # element-wise addition; the dimensionality of the vectors
    # will be unchanged.)

    ##### YOUR CODE HERE
    for word in df.index:
        char_reps[word] += df.loc[word].values

    # 4. Return a `pd.DataFrame` with the same index and column
    # values as `df`, but filled with the new representations
    # created at step 3.

    ##### YOUR CODE HERE
    ret_df = df.copy()
    for word in df.index:
        ret_df.loc[word] = char_reps[word]

    return ret_df
Example #5
def test_ngram_vsm(df, bigram, expected):
    X = vsm.ngram_vsm(df)  # relies on the default n, which yields bigrams
    result = X.loc[bigram]
    assert np.array_equal(result, expected)
# There is an implementation of TF-IDF for dense matrices in `vsm.tfidf`.
# 
# __Important__: `sklearn`'s version, [TfidfTransformer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer), assumes that term frequency (TF) is defined row-wise and document frequency is defined column-wise. That is, it assumes `sklearn`'s document $\times$ word basic design, which makes sense for classification tasks, where the design is example $\times$ features. This is the transpose of the way we've been thinking.
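# 
# A minimal sketch of bridging that difference (assuming the word $\times$ context count DataFrame `imdb5` used below; exact values may still differ from `vsm.tfidf` depending on smoothing and normalization settings):

from sklearn.feature_extraction.text import TfidfTransformer

# sklearn expects document x word, so transpose on the way in and out:
imdb5_tfidf_sk = pd.DataFrame(
    TfidfTransformer().fit_transform(imdb5.values.T).toarray().T,
    index=imdb5.index, columns=imdb5.columns)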

# ## Subword information
# 
# [Schütze (1993)](https://papers.nips.cc/paper/603-word-space) pioneered the use of subword information to improve representations by reducing sparsity, thereby increasing the density of connections in a VSM. In recent years, this idea has shown value in numerous contexts. 
# 
# [Bojanowski et al. (2016)](https://arxiv.org/abs/1607.04606) (the [fastText](https://fasttext.cc) team) explore a particularly straightforward approach to doing this: represent each word as the sum of the representations for the character-level n-grams it contains.
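# 
# As a toy sketch of that decomposition (this helper is ours, for illustration only; `vsm.ngram_vsm` below does the real work, and its exact tokenization may differ):

def char_ngrams(word, n=4):
    # Pad with boundary symbols so prefixes and suffixes get their own
    # distinct n-grams:
    chars = ["<w>"] + list(word) + ["</w>"]
    return ["".join(chars[i: i + n]) for i in range(len(chars) - n + 1)]

char_ngrams("superb")
# ['<w>sup', 'supe', 'uper', 'perb', 'erb</w>']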
# 
# It is simple to derive character-level n-gram representations from our existing VSMs. The function `vsm.ngram_vsm` implements the basic step. Here, we create the 4-gram version of `imdb5`:

# In[37]:


imdb5_ngrams = vsm.ngram_vsm(imdb5, n=4)


# In[38]:


imdb5_ngrams.shape


# This has the same column dimension as `imdb5`, but the rows are expanded with all the 4-grams, including boundary symbols `<w>` and `</w>`. 
# 
# `vsm.character_level_rep` is a simple function for creating new word representations from the associated character-level ones. Many variations on that function are worth trying – for example, you could include the original word vector where available, change the aggregation method from `sum` to something else, use a real morphological parser instead of just n-grams, and so on.
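# 
# For instance, a mean-pooling variant might look like this (a sketch; the function name is ours, and `char_ngrams` is the toy helper defined above):

def character_level_rep_mean(word, cf, n=4):
    # Keep only the n-grams that actually appear in the character-level
    # VSM `cf` (assumes at least one does):
    ngrams = [ng for ng in char_ngrams(word, n) if ng in cf.index]
    # Average rather than sum, so longer words aren't up-weighted just
    # for containing more n-grams:
    return cf.loc[ngrams].values.mean(axis=0)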

# One very powerful thing about this is that we can represent words that are not even in the original VSM:

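# For instance (an illustrative call; the word need not be in `imdb5`'s row index, only enough of its 4-grams in `imdb5_ngrams`):

vsm.character_level_rep("superbly", imdb5_ngrams, n=4)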
Example #7
import numpy as np
import pandas as pd
from mittens import GloVe
import vsm
import sst



def dice_distance(u, v):
    # Dice distance for non-negative count vectors: 1 minus the Dice
    # coefficient, 2 * sum(min(u, v)) / sum(u + v).
    return 1.0 - (2.0 * np.sum(np.minimum(u, v)) / np.sum(u + v))
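
# Quick sanity check with our own toy vectors: sum(min(u, v)) = 2 and
# sum(u + v) = 8, so the distance is 1 - 2*2/8 = 0.5.
u = np.array([1.0, 2.0, 0.0])
v = np.array([0.0, 2.0, 3.0])
dice_distance(u, v)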