import os
from collections import Counter

import sst

# `sst_home` is assumed to point at the local SST data directory;
# adjust to your own layout.
sst_home = os.path.join('data', 'sentiment')


def test_build_dataset_vectorizing():
    phi = lambda text: Counter(text.split())
    split_df = sst.dev_reader(sst_home)
    dataset = sst.build_dataset(
        split_df, phi, vectorizer=None, vectorize=True)
    assert len(dataset['X']) == split_df.shape[0]
    assert len(dataset['y']) == len(dataset['X'])
    assert len(dataset['raw_examples']) == len(dataset['X'])
def test_build_dataset_not_vectorizing():
    phi = lambda text: text
    split_df = sst.dev_reader(sst_home)
    dataset = sst.build_dataset(
        split_df, phi, vectorizer=None, vectorize=False)
    assert len(dataset['X']) == split_df.shape[0]
    assert dataset['X'] == dataset['raw_examples']
    assert len(dataset['y']) == len(dataset['X'])
def test_build_dataset_not_vectorizing_trees():
    # Tree-based variant of test_build_dataset_not_vectorizing, using the
    # reader-function interface rather than a split DataFrame. Renamed with
    # a suffix so it does not collide with the function above.
    phi = lambda tree: tree
    class_func = None
    reader = sst.train_reader
    dataset = sst.build_dataset(
        reader, phi, class_func, vectorizer=None, vectorize=False)
    assert len(dataset['X']) == len(list(reader()))
    assert dataset['X'] == dataset['raw_examples']
    assert len(dataset['y']) == len(dataset['X'])
def test_build_dataset_vectorizing_trees():
    # Tree-based variant of test_build_dataset_vectorizing, featurizing
    # the leaves of each tree.
    phi = lambda tree: Counter(tree.leaves())
    class_func = None
    reader = sst.train_reader
    dataset = sst.build_dataset(
        reader, phi, class_func, vectorizer=None, vectorize=True)
    assert len(dataset['X']) == len(list(reader()))
    assert len(dataset['y']) == len(dataset['X'])
    assert len(dataset['raw_examples']) == len(dataset['X'])
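# For orientation, a minimal sketch of the contract the tests above assert.
# This is an illustration, not the real sst.build_dataset: the DictVectorizer
# detail and the (examples, labels) inputs are assumptions.
from sklearn.feature_extraction import DictVectorizer


def build_dataset_sketch(examples, labels, phi, vectorizer=None, vectorize=True):
    """Return a dict with parallel 'X', 'y', and 'raw_examples' fields.
    With vectorize=False, 'X' is the raw phi outputs, so
    dataset['X'] == dataset['raw_examples'] holds when phi is the identity."""
    feats = [phi(ex) for ex in examples]
    if vectorize:
        # sparse=False so that len(X) gives the number of examples:
        if vectorizer is None:
            vectorizer = DictVectorizer(sparse=False)
            X = vectorizer.fit_transform(feats)
        else:
            X = vectorizer.transform(feats)
    else:
        X = feats
    return {'X': X, 'y': list(labels),
            'raw_examples': list(examples), 'vectorizer': vectorizer}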
def run_experiment(eta, embed, model, phrase):
    print("===================================================")
    print("eta: %s, embed_dim: %s, model: %s, phrase-level: %s" %
          (eta, embed, model, phrase))
    print("===================================================")
    if embed == 100:
        glove_lookup = glove_lookup_100
    elif embed == 200:
        glove_lookup = glove_lookup_200
    elif embed == 300:
        glove_lookup = glove_lookup_300
    else:
        raise ValueError("Unsupported embed_dim: %s" % embed)
    if model == "lifted":
        base_model = tf_lifted_trnn.TfLiftedTreeRNNClassifier
    else:
        base_model = tf_trnn.TfTreeRNNClassifier
    start = time.time()
    train = sst.build_dataset(
        sst.train_reader,
        lambda x: x,
        sst.ternary_class_func,
        vectorizer=None,
        vectorize=False,
        subtree_labels=phrase)
    # Manage the assessment set-up:
    X_train = train['X']
    y_train = train['y']
    X_assess = None
    y_assess = None
    # Assessment dataset using the training vectorizer:
    assess = sst.build_dataset(
        sst.dev_reader,
        lambda x: x,
        sst.ternary_class_func,
        vectorizer=train['vectorizer'],
        vectorize=False,
        subtree_labels=phrase)
    X_assess, y_assess = assess['X'], assess['y']
    test = sst.build_dataset(
        sst.test_reader,
        lambda x: x,
        sst.ternary_class_func,
        vectorizer=train['vectorizer'],
        vectorize=False,
        subtree_labels=phrase)
    X_test, y_test = test['X'], test['y']
    # Vocabulary: the 5,000 most frequent leaf tokens, plus "unk":
    tree_vocab = ["unk"] + sst.get_vocab(
        map(lambda x: x.leaves(), X_train), n_words=5000)
    # GloVe embedding matrix aligned with tree_vocab; out-of-vocabulary
    # words back off to the "unk" vector:
    embedding = np.asarray(
        [glove_lookup.get(k, glove_lookup["unk"]) for k in tree_vocab])
    model = base_model(
        tree_vocab,
        eta=eta,
        batch_size=16,
        embed_dim=embed,
        hidden_dim=embed,
        max_length=120,
        max_iter=10,
        embedding=embedding,
        reg=0.0,
        train_embedding=True,
        use_phrases=phrase)
    model.fit(X_train, y_train, X_assess=X_assess, y_assess=y_assess)
    # Ensure that restoring works properly!
    #model.restore(y_assess)
    #model.test(X_assess, y_assess)
    # Actual test:
    model.restore(y_test)
    model.test(X_test, y_test)
    end = time.time()
    print("Time: %s" % (end - start))
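# A driver along these lines would run the full sweep. The grid values are
# illustrative assumptions, not the settings from the original experiments.
import itertools

if __name__ == '__main__':
    for eta, embed, model, phrase in itertools.product(
            (0.05, 0.01),           # learning rates
            (100, 200, 300),        # GloVe dimensionalities
            ("lifted", "plain"),    # TreeRNN variants
            (True, False)):         # use phrase-level (subtree) labels?
        run_experiment(eta, embed, model, phrase)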
import numpy as np
import pandas as pd
from mittens import GloVe

import vsm
import sst

# Smoke checks that the course modules expose the expected names
# (bare attribute references; calling them here would require arguments):
sst.build_dataset
vsm.ngram_vsm
pd.DataFrame


def dice_distance(u, v):
    """Dice distance between nonnegative count vectors u and v:
    1 - 2 * sum(min(u_i, v_i)) / sum(u_i + v_i)."""
    return 1.0 - 2.0 * np.minimum(u, v).sum() / np.hstack((u, v)).sum()
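# Quick sanity check for dice_distance; the input vectors are arbitrary
# examples chosen for this illustration.
u = np.array([1, 0, 2, 3])
v = np.array([0, 1, 2, 1])
# min-overlap = 0 + 0 + 2 + 1 = 3; total mass = 6 + 4 = 10
print(dice_distance(u, v))  # 1 - 2 * 3 / 10 = 0.4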
# ## Building datasets for experiments
#
# The second major phase of our analysis is a set-up phase. It requires three ingredients:
#
# * A reader like `train_reader`
# * A feature function like `unigrams_phi`
# * A class function like `binary_class_func`
#
# The convenience function `sst.build_dataset` uses these to build a dataset for training and assessing a model. See its documentation for details on how it works. Much of this is about taking advantage of `sklearn`'s many functions for model building.

# In[7]:

train_dataset = sst.build_dataset(
    SST_HOME,
    reader=sst.train_reader,
    phi=unigrams_phi,
    class_func=sst.binary_class_func,
    vectorizer=None)

# In[9]:

train_dataset['X'].shape

# In[10]:

print(
    "Train dataset with unigram features has {:,} examples and {:,} features".format(
        *train_dataset['X'].shape))

# Notice that `sst.build_dataset` has an optional argument `vectorizer`:
#
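# The cell below is a sketch, not part of the original notebook. It assumes
# `unigrams_phi` is the whitespace-unigram counter its name suggests, and it
# reuses the training vectorizer (the same pattern as in the experiment code
# earlier in this section) so that the dev features live in the same feature
# space as the training features.

# In[ ]:

from collections import Counter


def unigrams_phi(text):
    """Count the whitespace-delimited tokens of `text`."""
    return Counter(text.split())


dev_dataset = sst.build_dataset(
    SST_HOME,
    reader=sst.dev_reader,
    phi=unigrams_phi,
    class_func=sst.binary_class_func,
    vectorizer=train_dataset['vectorizer'])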