def run_ppmi_lsa_pipeline(count_df, k):
    ##### YOUR CODE HERE
    count_df_pmi = vsm.pmi(count_df)
    count_df_pmi_lsa = vsm.lsa(count_df_pmi, k)
    eval_results = full_word_similarity_evaluation(count_df_pmi_lsa)
    return eval_results
def run_ppmi_lsa_pipeline(count_df, k):
    ##### YOUR CODE HERE
    counts_ppmi = vsm.pmi(count_df, positive=True)
    counts_ppmi_lsa = vsm.lsa(counts_ppmi, k=k)
    results = full_word_similarity_evaluation(counts_ppmi_lsa)
    display(results)
    return results
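# For reference, a minimal sketch of the Positive PMI reweighting that
# vsm.pmi presumably performs (ppmi_sketch is illustrative only, not part
# of the vsm module):
import numpy as np

def ppmi_sketch(df):
    total = df.values.sum()
    joint = df / total                    # P(w, c)
    p_w = df.sum(axis=1) / total          # P(w): row marginals
    p_c = df.sum(axis=0) / total          # P(c): column marginals
    with np.errstate(divide='ignore'):
        pmi = np.log(joint.div(p_w, axis=0).div(p_c, axis=1))
    return pmi.clip(lower=0)              # Positive PMI: max(0, PMI)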
def run_original_system(count_df, k=10):
    # NOTE: the WordNet edges are computed here but never used below.
    wn_edges = get_wordnet_edges()
    wn_index_edges = convert_edges_to_indices(wn_edges, count_df.fillna(0))
    ppmi_df = vsm.pmi(count_df)
    ppmi_lsa_df = vsm.lsa(ppmi_df, k)
    return full_word_similarity_evaluation(ppmi_lsa_df)
def run_ppmi_lsa_pipeline(count_df, k):
    # Part 2: PPMI reweighting
    count_ppmi = vsm.pmi(count_df)
    # Part 3: LSA dimensionality reduction
    count_ppmi_lsa = vsm.lsa(count_ppmi, k)
    # Part 4: full word-similarity evaluation
    output = full_word_similarity_evaluation(count_ppmi_lsa)
    print(output)
    return output
if 'IS_GRADESCOPE_ENV' not in os.environ:
    giga20 = pd.read_csv(
        os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'), index_col=0)
    giga20_ppmi = vsm.pmi(giga20, positive=True)
    print("giga20_ppmi")
    display(full_word_similarity_evaluation(giga20_ppmi))

# ### PPMI + LSA

# %%
if 'IS_GRADESCOPE_ENV' not in os.environ:
    print("giga20_ppmi_lsa")
    for k in (5, 10, 20, 50, 100):
        giga20_ppmi_lsa = vsm.lsa(giga20_ppmi, k=k)
        print("========", k, "========")
        display(full_word_similarity_evaluation(giga20_ppmi_lsa))

# %%
if 'IS_GRADESCOPE_ENV' not in os.environ:
    print("giga20_ppmi_lsa")
    for k in (200, 500, 1000):
        giga20_ppmi_lsa = vsm.lsa(giga20_ppmi, k=k)
        print("========", k, "========")
        display(full_word_similarity_evaluation(giga20_ppmi_lsa))

# %%
if 'IS_GRADESCOPE_ENV' not in os.environ:
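# %%
# To compare the k values side by side, one could collect the sweep into a
# single frame (a sketch; it assumes full_word_similarity_evaluation returns
# a pandas Series of correlations per dataset):
if 'IS_GRADESCOPE_ENV' not in os.environ:
    results_by_k = {
        k: full_word_similarity_evaluation(vsm.lsa(giga20_ppmi, k=k))
        for k in (5, 10, 20, 50, 100, 200, 500, 1000)}
    display(pd.DataFrame(results_by_k))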
def test_lsa(df):
    # Smoke test: LSA should run without error on a small frame.
    vsm.lsa(df, k=2)
# ### IMDB representations
#
# Our IMDB VSMs seem pretty well-attuned to the Stanford Sentiment Treebank, so we might think that they can do even better than the general-purpose GloVe inputs. Here are two quick assessments of that idea:

# In[10]:
imdb20 = pd.read_csv(
    os.path.join(VSMDATA_HOME, 'imdb_window20-flat.csv.gz'), index_col=0)

# In[11]:
imdb20_ppmi = vsm.pmi(imdb20, positive=True)

# In[12]:
imdb20_ppmi_svd = vsm.lsa(imdb20_ppmi, k=50)

# In[13]:
imdb_lookup = dict(zip(imdb20_ppmi_svd.index, imdb20_ppmi_svd.values))

# In[14]:
def imdb_phi(tree, np_func=np.sum):
    return vsm_leaves_phi(tree, imdb_lookup, np_func=np_func)

# In[15]:
_ = sst.experiment(
# In[10]:
vsm.neighbors('gnarly', gnarly_df)

# Reweighting doesn't help. For example, here is the attempt with Positive PMI:

# In[11]:
vsm.neighbors('gnarly', vsm.pmi(gnarly_df))

# However, both words tend to occur with _awesome_ and not with _lame_ or _terrible_, so there is an important sense in which they are similar. LSA to the rescue:

# In[12]:
gnarly_lsa_df = vsm.lsa(gnarly_df, k=2)

# In[13]:
vsm.neighbors('gnarly', gnarly_lsa_df)

# ### Applying LSA to real VSMs
#
# Here's an example that begins to convey the effect that this can have empirically.
#
# First, the original count matrix:

# In[14]:
vsm.neighbors('superb', imdb5).head()
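# For reference, vsm.lsa as used above is presumably a thin wrapper around
# truncated SVD. A minimal sketch of that reduction (lsa_sketch is
# illustrative, not the course API):
import numpy as np
import pandas as pd

def lsa_sketch(df, k=2):
    # Keep the top-k left singular vectors, scaled by their singular values.
    U, s, _ = np.linalg.svd(df.values, full_matrices=False)
    return pd.DataFrame(U[:, :k] * s[:k], index=df.index)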
def run_ppmi_lsa_pipeline(count_df, k):
    ##### YOUR CODE HERE
    ppmi_reweight_df = vsm.pmi(count_df)
    lsa_df = vsm.lsa(ppmi_reweight_df, k)
    results = full_word_similarity_evaluation(lsa_df)
    return results
def run_ppmi_lsa_pipeline(count_df, k):
    ##### YOUR CODE HERE
    ppmi_df = vsm.pmi(count_df)
    ppmi_lsa_df = vsm.lsa(ppmi_df, k)
    return full_word_similarity_evaluation(ppmi_lsa_df)
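# A usage sketch for the pipeline above, reusing the Gigaword count matrix
# from the earlier cells (VSM_HOME and the file name are carried over from
# there, not defined here):
if 'IS_GRADESCOPE_ENV' not in os.environ:
    giga20 = pd.read_csv(
        os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'), index_col=0)
    display(run_ppmi_lsa_pipeline(giga20, k=10))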
        # (Tail of a Dataset class definition: __getitem__ returns one
        # input/target pair, __len__ reports the stored length.)
        return self.X[index], self.Y[index]

    def __len__(self):
        return self.len

# Cosine loss: 1 minus the cosine similarity between output and target.
def cosine_loss(output, target):
    num = torch.mm(output, torch.t(target))
    den = torch.sqrt(torch.sum(output**2) * torch.sum(target**2))
    return (1 - num / den)

# Loading data and pre-processing
giga = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"), index_col=0)
giga = vsm.pmi(giga)
giga = vsm.lsa(giga, k=750)

# Defining hyperparameters
num_epochs = 200
batch_size = 128
step_rate = 0.15
learning_rate = 1e-4
examples = giga.shape[0]
features = giga.shape[1]

# Preparing the model and optimizer
step = int(features * step_rate)
model = autoencoder(step)
criterion = nn.MSELoss()  # MSE reconstruction loss; cosine_loss above is unused here
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
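# The excerpt stops before the training loop. A minimal sketch of the loop
# these definitions presumably feed (the TensorDataset/DataLoader setup is an
# assumption; the original wraps the data in its own Dataset class above):
X = torch.tensor(giga.values, dtype=torch.float32)
loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(X, X), batch_size=batch_size, shuffle=True)
for epoch in range(num_epochs):
    for inputs, targets in loader:
        optimizer.zero_grad()
        reconstruction = model(inputs)  # autoencoder forward pass
        loss = criterion(reconstruction, targets)
        loss.backward()
        optimizer.step()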