def test_predict_functions_honor_device(random_matrix, func):
    X = random_matrix
    mod = TorchAutoencoder(hidden_dim=5, max_iter=2)
    mod.fit(X)
    prediction_func = getattr(mod, func)
    with pytest.raises(RuntimeError):
        prediction_func(X, device="FAKE_DEVICE")
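
# The tests in this module rely on fixtures (e.g. `random_matrix`,
# `random_low_rank_matrix`) and on `pytest.mark.parametrize` decorators that
# are defined elsewhere in the test suite. A minimal sketch of what such a
# fixture might look like -- the name, shape, and value range here are
# assumptions for illustration, not the suite's actual definitions:

import numpy as np
import pytest


@pytest.fixture
def random_matrix_sketch():
    # A small dense matrix of uniform noise, standing in for `random_matrix`.
    return np.random.uniform(low=0.0, high=1.0, size=(20, 50))


# A parametrized test over prediction-style methods would be set up roughly as
# (the method names are an assumption):
#
# @pytest.mark.parametrize("func", ["predict", "score"])
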
def test_simple_example_params(random_low_rank_matrix, param, expected):
    X = random_low_rank_matrix
    ae = TorchAutoencoder(**{param: expected})
    H = ae.fit(X)
    X_pred = ae.predict(X)
    r2 = ae.score(X)
    assert r2 > 0.92
def test_hidden_activation_in_graph(random_matrix):
    X = random_matrix
    mod = TorchAutoencoder(max_iter=1, hidden_activation=nn.ReLU())
    mod.fit(X)
    mod_hidden_activation = mod.hidden_activation.__class__
    graph_activation_class = mod.model[1].__class__
    assert mod_hidden_activation == graph_activation_class
def test_model_graph_dimensions(random_matrix, attr, layer_index, weight_dim):
    X = random_matrix
    mod = TorchAutoencoder(max_iter=1)
    mod.fit(X)
    mod_attr_val = getattr(mod, attr)
    graph_dim = mod.model[layer_index].weight.shape[weight_dim]
    assert mod_attr_val == graph_dim
def test_predict_functions_restore_device(random_matrix, func):
    X = random_matrix
    mod = TorchAutoencoder(hidden_dim=5, max_iter=2)
    mod.fit(X)
    current_device = mod.device
    assert current_device != torch.device("cpu:0")
    prediction_func = getattr(mod, func)
    prediction_func(X, device="cpu:0")
    assert mod.device == current_device
def test_build_dataset(random_matrix, with_y, expected):
    X = random_matrix
    mod = TorchAutoencoder()
    if with_y:
        dataset = mod.build_dataset(X, X)
    else:
        dataset = mod.build_dataset(X)
    result = next(iter(dataset))
    assert len(result) == expected
def autoencoder_evaluation(nrow=1000, ncol=100, rank=20, max_iter=20000):
    """This is an evaluation in which `TorchAutoencoder` should be able to
    perfectly reconstruct the input data, because the hidden representations
    have the same dimensionality as the rank of the input matrix.
    """
    X = randmatrix(nrow, rank).dot(randmatrix(rank, ncol))
    ae = TorchAutoencoder(hidden_dim=rank, max_iter=max_iter)
    ae.fit(X)
    X_pred = ae.predict(X)
    mse = (0.5 * (X_pred - X)**2).mean()
    return (X, X_pred, mse)
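
# `randmatrix` (used above) is assumed to be a small helper returning a random
# matrix of a given shape; a sketch of such a helper and a usage example of
# `autoencoder_evaluation`, with the value range and sizes chosen purely for
# illustration:

import numpy as np


def randmatrix_sketch(m, n, lower=-0.5, upper=0.5):
    # Uniform random matrix of shape (m, n).
    return np.random.uniform(low=lower, high=upper, size=(m, n))


# Example (assuming `randmatrix` behaves like the sketch above): with
# `hidden_dim` equal to the true rank of X, the MSE should approach 0 as
# `max_iter` grows.
#
# X, X_pred, mse = autoencoder_evaluation(nrow=200, ncol=50, rank=10, max_iter=1000)
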
def test_model(random_matrix, model_class, pandas):
    """Just makes sure that this code will run; it doesn't check that
    it is creating good models.
    """
    X = random_matrix
    if pandas:
        X = pd.DataFrame(X)
    ae = model_class(hidden_dim=5, max_iter=100)
    H = ae.fit(X)
    ae.predict(X)
    H_is_pandas = isinstance(H, pd.DataFrame)
    assert H_is_pandas == pandas
def test_save_load(random_matrix):
    X = random_matrix
    mod = TorchAutoencoder(hidden_dim=5, max_iter=2)
    mod.fit(X)
    mod.predict(X)
    with tempfile.NamedTemporaryFile(mode='wb') as f:
        name = f.name
        mod.to_pickle(name)
        mod2 = TorchAutoencoder.from_pickle(name)
        mod2.predict(X)
        mod2.fit(X)
def model():
    df = pd.read_csv(
        os.path.join(VSM_HOME, 'imdb_window5-scaled.csv.gz'), index_col=0)
    df = vsm.pmi(df)
    df = ttest(df)
    # df = subword_enrichment(df)
    df = df.apply(vsm.length_norm, axis=1)
    df = TorchAutoencoder(hidden_dim=500, max_iter=200, eta=1e-3).fit(df)
    return full_word_similarity_evaluation(df)
if 'IS_GRADESCOPE_ENV' not in os.environ:
    import torch
    from torch_autoencoder import TorchAutoencoder

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print("Using device: {}".format(device))

    for hidden_dim in (50, 150, 300, 500):
        for l2_strength in (0.0, 0.5, 1.0):
            print("Dim: {}, l2: {}".format(hidden_dim, l2_strength))
            ae = TorchAutoencoder(
                max_iter=100,
                hidden_dim=hidden_dim,
                eta=0.1,
                l2_strength=l2_strength,
                device=device)
            giga20_ppmi_ae = ae.fit(giga20_ppmi)
            display(full_word_similarity_evaluation(giga20_ppmi_ae))


# %%

if 'IS_GRADESCOPE_ENV' not in os.environ:
    print("Using device: {}".format(device))
    for hidden_dim in (50, 150, 300, 500):
        for l2_strength in (0.0, 0.5, 1.0):
            print("Dim: {}, l2: {}".format(hidden_dim, l2_strength))
            ae = TorchAutoencoder(
                max_iter=100,
                hidden_dim=hidden_dim,
                eta=0.1,
                l2_strength=l2_strength,
                device=device)
print("Autoencoder evaluation MSE after {0} evaluations: {1:0.04f}".format( ae_max_iter, ae)) # ### Applying autoencoders to real VSMs # # You can apply the autoencoder directly to the count matrix, but this could interact very badly with the internal activation function: if the counts are all very high or very low, then everything might get pushed irrevocably towards the extreme values of the activation. # # Thus, it's a good idea to first normalize the values somehow. Here, I use `vsm.length_norm`: # In[30]: imdb5_l2 = imdb5.apply(vsm.length_norm, axis=1) # In[31]: imdb5_l2_ae = TorchAutoencoder(max_iter=100, hidden_dim=50, eta=0.001).fit(imdb5_l2) # In[32]: vsm.neighbors('superb', imdb5_l2_ae).head() # This is very slow and seems not to work all that well. To speed things up, one can first apply LSA or similar: # In[33]: imdb5_l2_svd100 = vsm.lsa(imdb5_l2, k=100) # In[34]: imdb_l2_svd100_ae = TorchAutoencoder(max_iter=1000, hidden_dim=50, eta=0.01).fit(imdb5_l2_svd100)
def test_parameter_setting(param, expected):
    mod = TorchAutoencoder()
    mod.set_params(**{param: expected})
    result = getattr(mod, param)
    assert result == expected
def test_params(param, expected):
    mod = TorchAutoencoder(**{param: expected})
    result = getattr(mod, param)
    assert result == expected
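
# `test_parameter_setting` and `test_params` run once per (param, expected)
# pair supplied by a `pytest.mark.parametrize` decorator defined in the test
# module. The pairs below are an assumed illustration built only from
# parameters that appear elsewhere in this code, not the module's actual list:

EXAMPLE_PARAMS = [
    ("hidden_dim", 10),
    ("max_iter", 5),
    ("eta", 0.02),
    ("l2_strength", 0.01),
]
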
def test_build_dataset_input_dim(random_matrix, early_stopping):
    X = random_matrix
    mod = TorchAutoencoder(early_stopping=early_stopping)
    dataset = mod.build_dataset(X)
    assert mod.input_dim == X.shape[1]