def subword_enrichment(df, n=4):
    """Enrich each word vector in `df` with its character-level representation.

    Parameters
    ----------
    df : pd.DataFrame
        A word-level VSM; rows are words, columns are context features.
    n : int
        Size of the character n-grams used for the character-level VSM.

    Returns
    -------
    pd.DataFrame
        Same index and columns as `df`; each row is the element-wise sum of
        the original word vector and its character-n-gram representation.
    """
    # Character-level VSM: rows are the n-grams occurring in df's vocabulary.
    char_vsm = vsm.ngram_vsm(df, n)
    # One character-level representation per word, stacked into a matrix whose
    # row order matches df.index.
    subword_matrix = np.stack(
        [vsm.character_level_rep(word, char_vsm, n) for word in df.index])
    # ndarray addition is element-wise and keeps df's index/column labels.
    return df + subword_matrix
def subword_enrichment(df, n=4):
    """Enrich each word vector in `df` with its character-level representation.

    Parameters
    ----------
    df : pd.DataFrame
        A word-level VSM; rows are words, columns are context features.
    n : int
        Size of the character n-grams used for the character-level VSM.

    Returns
    -------
    pd.DataFrame
        Same index and columns as `df`; each row is the element-wise sum of
        the original word vector and its character-n-gram representation.
    """
    # Character-level VSM built from df's vocabulary.
    df_ngrams = vsm.ngram_vsm(df, n)
    # Character-level representation for every word, in df.index order.
    new_matrix = [vsm.character_level_rep(word, df_ngrams, n)
                  for word in df.index]
    # BUGFIX: the columns must be df's own labels. Without `columns=df.columns`
    # this frame gets a positional RangeIndex, and `df.add` (which aligns on
    # labels) would produce an all-NaN union frame instead of the sum.
    df_sub = pd.DataFrame(new_matrix, index=df.index, columns=df.columns)
    return df.add(df_sub)
def subword_enrichment(df, n=4):
    """Enrich each word vector in `df` with its character-level representation.

    Parameters
    ----------
    df : pd.DataFrame
        A word-level VSM; rows are words, columns are context features.
    n : int
        Size of the character n-grams used for the character-level VSM.

    Returns
    -------
    pd.DataFrame
        Same index and columns as `df`; each row is the element-wise sum of
        the original word vector and its character-n-gram representation.
    """
    # Character-level VSM built from df's vocabulary.
    # (Removed leftover debug `print(vsm_char)` and a dead DataFrame that was
    # immediately overwritten.)
    vsm_char = vsm.ngram_vsm(df, n=n)
    # Sum each word's character-level rep with its original vector.
    result = {}
    for word in df.index:
        result[word] = np.add(
            vsm.character_level_rep(word, vsm_char, n=n),
            np.array(df.loc[word]))
    new_df = pd.DataFrame.from_dict(result, orient='index')
    # BUGFIX: from_dict gives a positional RangeIndex for columns; restore
    # df's column labels so the result matches df's index AND columns.
    new_df.columns = df.columns
    return new_df
def subword_enrichment(df, n=4):
    """Enrich each word vector in `df` with its character-level representation.

    Parameters
    ----------
    df : pd.DataFrame
        A word-level VSM; rows are words, columns are context features.
    n : int
        Size of the character n-grams used for the character-level VSM.

    Returns
    -------
    pd.DataFrame
        Same index and columns as `df`; each row is the element-wise sum of
        the original word vector and its character-n-gram representation.
    """
    # Character-level VSM built from df's vocabulary.
    # (Removed two leftover debug `print` calls.)
    df_ngrams = vsm.ngram_vsm(df, n=n)
    # Character-level rep plus original vector, one row per word in df.index
    # order. Building the matrix once and constructing a single DataFrame
    # replaces the original's per-row `ret_df.loc[word] = ...` loop, which is
    # an accidental O(rows) pandas-indexing pass with the same result.
    enriched = np.stack([
        vsm.character_level_rep(word, df_ngrams, n=n) + df.loc[word].values
        for word in df.index])
    return pd.DataFrame(enriched, index=df.index, columns=df.columns)
# This has the same column dimension as `imdb5`, but the rows are expanded with all the 4-grams, including boundary symbols `<w>` and `</w>`. # # `vsm.character_level_rep` is a simple function for creating new word representations from the associated character-level ones. Many variations on that function are worth trying – for example, you could include the original word vector where available, change the aggregation method from `sum` to something else, use a real morphological parser instead of just n-grams, and so on. # One very powerful thing about this is that we can represent words that are not even in the original VSM: # In[39]: 'superbly' in imdb5.index # In[40]: superbly = vsm.character_level_rep("superbly", imdb5_ngrams) # In[41]: superb = vsm.character_level_rep("superb", imdb5_ngrams) # In[42]: vsm.cosine(superb, superbly) # ## Visualization