def subword_enrichment(df, n=4):
    """Enrich every word vector in `df` with character-level n-gram
    information.

    Each word's new vector is the element-wise sum of its original row
    in `df` and the representation that `vsm.character_level_rep`
    builds for it from the character-level VSM created by
    `vsm.ngram_vsm`.

    Parameters
    ----------
    df : pd.DataFrame
        The VSM to enrich. Its index supplies the words.
    n : int
        Size of the character n-grams.

    Returns
    -------
    pd.DataFrame
        Same index and column values as `df`, filled with the summed
        representations.

    """
    # 1. Character-level VSM derived from `df`.
    df_ngrams = vsm.ngram_vsm(df, n)

    # 2. One character-level representation per word, in index order.
    new_matrix = [
        vsm.character_level_rep(word, df_ngrams, n) for word in df.index]

    # 3. Label the new matrix exactly like `df`. `DataFrame.add` aligns
    # on index *and* column labels, so leaving the default integer
    # columns here would produce NaNs (and extra columns) whenever `df`
    # has labelled columns.
    df_sub = pd.DataFrame(new_matrix, index=df.index, columns=df.columns)

    # 4. Element-wise sum; the labels match, so the shape is unchanged.
    return df.add(df_sub)
def subword_enrichment(df, n=4):
    """Return `df` with each row increased by the character-level
    representation of that row's word.

    Parameters
    ----------
    df : pd.DataFrame
        The VSM to enrich. Its index supplies the words.
    n : int
        Size of the character n-grams.

    Returns
    -------
    pd.DataFrame
        The element-wise sum of `df` and the stacked character-level
        representations.

    """
    # Step 1: character-level VSM built from `df` with n-grams of size `n`.
    char_vsm = vsm.ngram_vsm(df, n)

    def char_rep(word):
        """Character-level representation of `word` under `char_vsm`."""
        return vsm.character_level_rep(word, char_vsm, n)

    # Step 2: one representation per word, stacked in index order.
    stacked = np.stack(df.index.map(char_rep))

    # Steps 3-4: adding a plain array to `df` is positional, so the
    # result keeps the index and columns of `df`.
    enriched = df + stacked
    return enriched
def subword_enrichment(df, n=4):
    """Sum each word's vector in `df` with its character-level n-gram
    representation.

    Parameters
    ----------
    df : pd.DataFrame
        The VSM to enrich. Its index supplies the words.
    n : int
        Size of the character n-grams.

    Returns
    -------
    pd.DataFrame
        Same index and column values as `df`, filled with the summed
        representations.

    """
    # 1. Character-level VSM derived from `df`.
    vsm_char = vsm.ngram_vsm(df, n=n)

    # 2-3. Pair every word with its own row positionally (rather than
    # via `df.loc[word]`), so repeated index labels still yield exactly
    # one 1-d vector per row.
    enriched = [
        np.add(vsm.character_level_rep(word, vsm_char, n=n), row)
        for word, row in zip(df.index, df.values)
    ]

    # 4. Reuse the labels of `df` so that the column values are
    # preserved and not replaced by a default integer range.
    return pd.DataFrame(enriched, index=df.index, columns=df.columns)
def subword_enrichment(df, n=4):
    """Build a copy of `df` whose rows also carry character-level
    n-gram information.

    For every word in `df.index`, the representation returned by
    `vsm.character_level_rep` (computed against the n-gram VSM from
    `vsm.ngram_vsm`) is added element-wise to the word's original row.

    Parameters
    ----------
    df : pd.DataFrame
        The VSM to enrich. Its index supplies the words.
    n : int
        Size of the character n-grams.

    Returns
    -------
    pd.DataFrame
        A new frame with the same index and column values as `df`.
        `df` itself is not modified.

    """
    # Nothing to enrich: hand back an equally-labelled empty copy.
    if df.empty:
        return df.copy()

    # 1. Character-level VSM derived from `df`.
    df_ngrams = vsm.ngram_vsm(df, n=n)

    # 2. One character-level representation per word, in index order.
    char_reps = np.array(
        [vsm.character_level_rep(word, df_ngrams, n=n) for word in df.index])

    # 3. A single out-of-place matrix addition. This neither mutates
    # the arrays returned by the helper nor depends on label lookups,
    # which are ambiguous when a word occurs more than once in the index.
    summed = char_reps + df.values

    # 4. Re-wrap the result with the labels of `df`.
    return pd.DataFrame(summed, index=df.index, columns=df.columns)
def test_ngram_vsm(df, bigram, expected):
    """Check that the row labelled `bigram` in `vsm.ngram_vsm(df)`
    equals `expected`.
    """
    ngram_row = vsm.ngram_vsm(df).loc[bigram]
    assert np.array_equal(ngram_row, expected)
# There is an implementation of TF-IDF for dense matrices in `vsm.tfidf`. # # __Important__: `sklearn`'s version, [TfidfTransformer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer), assumes that term frequency (TF) is defined row-wise and document frequency is defined column-wise. That is, it assumes `sklearn`'s document $\times$ word basic design, which makes sense for classification tasks, where the design is example $\times$ features. This is the transpose of the way we've been thinking. # ## Subword information # # [Schütze (1993)](https://papers.nips.cc/paper/603-word-space) pioneered the use of subword information to improve representations by reducing sparsity, thereby increasing the density of connections in a VSM. In recent years, this idea has shown value in numerous contexts. # # [Bojanowski et al. (2016)](https://arxiv.org/abs/1607.04606) (the [fastText](https://fasttext.cc) team) explore a particularly straightforward approach to doing this: represent each word as the sum of the representations for the character-level n-grams it contains. # # It is simple to derive character-level n-gram representations from our existing VSMs. The function `vsm.ngram_vsm` implements the basic step. Here, we create the 4-gram version of `imdb5`: # In[37]: imdb5_ngrams = vsm.ngram_vsm(imdb5, n=4) # In[38]: imdb5_ngrams.shape # This has the same column dimension as `imdb5`, but the rows are expanded with all the 4-grams, including boundary symbols `<w>` and `</w>`. # # `vsm.character_level_rep` is a simple function for creating new word representations from the associated character-level ones. Many variations on that function are worth trying – for example, you could include the original word vector where available, change the aggregation method from `sum` to something else, use a real morphological parser instead of just n-grams, and so on.
# One very powerful thing about this is that we can represent words that are not even in the original VSM: # In[39]:
import numpy as np
import pandas as pd
from mittens import GloVe
import vsm
import sst

# NOTE(review): the three bare expressions below are kept exactly as
# found. They run at import time and two of them are calls without
# arguments -- confirm that they are intended.
sst.build_dataset()
vsm.ngram_vsm()
pd.DataFrame


def dice_distance(u, v):
    """Dice distance between two vectors of equal length.

    Computed as ``1 - 2 * sum(min(u_i, v_i)) / sum(u_i + v_i)``.

    Parameters
    ----------
    u, v : array-like
        Numeric vectors of the same shape.

    Returns
    -------
    float
        0.0 for identical vectors with a non-zero total. If
        ``sum(u + v)`` is zero, the result follows NumPy's
        floating-point division rules (``nan`` for two all-zero
        vectors).

    """
    u = np.asarray(u)
    v = np.asarray(v)
    # `np.minimum` is the element-wise minimum of two arrays;
    # `np.min(u, v)` would instead interpret `v` as the `axis` argument.
    overlap = np.sum(np.minimum(u, v))
    total = np.sum(u + v)
    return 1.0 - (2.0 * overlap / total)