Example #1
def test_build_dataset_vectorizing():
    phi = lambda text: Counter(text.split())
    split_df = sst.dev_reader(sst_home)
    dataset = sst.build_dataset(split_df, phi, vectorizer=None, vectorize=True)
    assert len(dataset['X']) == split_df.shape[0]
    assert len(dataset['y']) == len(dataset['X'])
    assert len(dataset['raw_examples']) == len(dataset['X'])
Example #2
def test_build_dataset_not_vectorizing():
    phi = lambda text: text
    split_df = sst.dev_reader(sst_home)
    dataset = sst.build_dataset(split_df,
                                phi,
                                vectorizer=None,
                                vectorize=False)
    assert len(dataset['X']) == split_df.shape[0]
    assert dataset['X'] == dataset['raw_examples']
    assert len(dataset['y']) == len(dataset['X'])
Example #3
def test_build_dataset_not_vectorizing():
    phi = lambda tree: tree
    class_func = None
    reader = sst.train_reader
    dataset = sst.build_dataset(reader,
                                phi,
                                class_func,
                                vectorizer=None,
                                vectorize=False)
    assert len(dataset['X']) == len(list(reader()))
    assert dataset['X'] == dataset['raw_examples']
    assert len(dataset['y']) == len(dataset['X'])
Example #4
def test_build_dataset_vectorizing():
    phi = lambda tree: Counter(tree.leaves())
    class_func = None
    reader = sst.train_reader
    dataset = sst.build_dataset(reader,
                                phi,
                                class_func,
                                vectorizer=None,
                                vectorize=True)
    assert len(dataset['X']) == len(list(reader()))
    assert len(dataset['y']) == len(dataset['X'])
    assert len(dataset['raw_examples']) == len(dataset['X'])
Example #5
def run_experiment(eta, embed, model, phrase):
    print("===================================================")
    print("eta: %s, embed_dim: %s, model: %s, phrase-level: %s" %
          (eta, embed, model, phrase))
    print("===================================================")

    if embed == 100:
        glove_lookup = glove_lookup_100
    elif embed == 200:
        glove_lookup = glove_lookup_200
    elif embed == 300:
        glove_lookup = glove_lookup_300

    if model == "lifted":
        base_model = tf_lifted_trnn.TfLiftedTreeRNNClassifier
    else:
        base_model = tf_trnn.TfTreeRNNClassifier

    start = time.time()
    train = sst.build_dataset(sst.train_reader,
                              lambda x: x,
                              sst.ternary_class_func,
                              vectorizer=None,
                              vectorize=False,
                              subtree_labels=phrase)
    # Manage the assessment set-up:
    X_train = train['X']
    y_train = train['y']
    X_assess = None
    y_assess = None
    # Assessment dataset using the training vectorizer:
    assess = sst.build_dataset(sst.dev_reader,
                               lambda x: x,
                               sst.ternary_class_func,
                               vectorizer=train['vectorizer'],
                               vectorize=False,
                               subtree_labels=phrase)
    X_assess, y_assess = assess['X'], assess['y']

    test = sst.build_dataset(sst.test_reader,
                             lambda x: x,
                             sst.ternary_class_func,
                             vectorizer=train['vectorizer'],
                             vectorize=False,
                             subtree_labels=phrase)
    X_test, y_test = test['X'], test['y']

    tree_vocab = ["unk"] + sst.get_vocab(map(lambda x: x.leaves(), X_train),
                                         n_words=5000)
    embedding = np.asarray(
        [glove_lookup.get(k, glove_lookup["unk"]) for k in tree_vocab])
    model = base_model(tree_vocab,
                       eta=eta,
                       batch_size=16,
                       embed_dim=embed,
                       hidden_dim=embed,
                       max_length=120,
                       max_iter=10,
                       embedding=embedding,
                       reg=0.0,
                       train_embedding=True,
                       use_phrases=phrase)
    model.fit(X_train, y_train, X_assess=X_assess, y_assess=y_assess)
    # Ensure that restoring works properly!
    #model.restore(y_assess)
    #model.test(X_assess, y_assess)
    # Actual test.
    model.restore(y_test)
    model.test(X_test, y_test)
    end = time.time()
    print("Time: %s" % (end - start))
Example #6
import numpy as np
import pandas as pd
from mittens import GloVe
import vsm
import sst

sst.build_dataset()

vsm.ngram_vsm()

pd.DataFrame


def dice_distance(u, v):
    # Dice distance for non-negative count vectors:
    # 1 - 2 * sum(min(u_i, v_i)) / (sum(u) + sum(v)).
    return 1.0 - 2.0 * np.minimum(u, v).sum() / np.hstack((u, v)).sum()
# ## Building datasets for experiments
#
# The second major phase for our analysis is a kind of set-up phase. Ingredients:
#
# * A reader like `train_reader`
# * A feature function like `unigrams_phi`
# * A class function like `binary_class_func`
#
# The convenience function `sst.build_dataset` uses these to build a dataset for training and assessing a model. See its documentation for details on how it works. Much of this is about taking advantage of `sklearn`'s many functions for model building.
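#
# `unigrams_phi` itself is not defined in this excerpt. A minimal sketch of
# what such a feature function could look like (an assumption here, not the
# notebook's exact definition) is to count the tokens at the leaves of each
# SST tree:

from collections import Counter

def unigrams_phi(tree):
    # Bag-of-words counts over the tree's leaf tokens (hypothetical sketch).
    return Counter(tree.leaves())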

# In[7]:

train_dataset = sst.build_dataset(SST_HOME,
                                  reader=sst.train_reader,
                                  phi=unigrams_phi,
                                  class_func=sst.binary_class_func,
                                  vectorizer=None)

# In[9]:

train_dataset['X'].shape

# In[10]:

print(
    "Train dataset with unigram features has {:,} examples and {:,} features".
    format(*train_dataset['X'].shape))

# Notice that `sst.build_dataset` has an optional argument `vectorizer`:
#
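# In the training call above it was left at `None`. The returned dataset
# exposes the vectorizer that was used (the `run_experiment` example above
# reads it back as `train['vectorizer']`), and passing it in again keeps an
# assessment split in the same feature space as the training data. A minimal
# sketch, assuming the same `SST_HOME` and `unigrams_phi` as above:

dev_dataset = sst.build_dataset(SST_HOME,
                                reader=sst.dev_reader,
                                phi=unigrams_phi,
                                class_func=sst.binary_class_func,
                                vectorizer=train_dataset['vectorizer'])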