Example 1
import os
from collections import Counter

import sst

sst_home = os.path.join('data', 'sentiment')


def test_build_dataset_vectorizing():
    phi = lambda text: Counter(text.split())
    split_df = sst.dev_reader(sst_home)
    dataset = sst.build_dataset(split_df, phi, vectorizer=None, vectorize=True)
    assert len(dataset['X']) == split_df.shape[0]
    assert len(dataset['y']) == len(dataset['X'])
    assert len(dataset['raw_examples']) == len(dataset['X'])
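
The assertions pin down the returned dict: 'X', 'y', and 'raw_examples' are aligned, with one entry per row of the split. A minimal sketch of the train/assess pattern this API suggests; the 'vectorizer' key is an assumption here, not something the test above checks:

train_split = sst.train_reader(sst_home)
dev_split = sst.dev_reader(sst_home)

train = sst.build_dataset(train_split, phi, vectorizer=None, vectorize=True)

# Reuse the fitted vectorizer so the dev features live in the train feature space.
dev = sst.build_dataset(dev_split, phi, vectorizer=train['vectorizer'], vectorize=True)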
Example 2
import os

import sst

sst_home = os.path.join('data', 'sentiment')


def test_build_dataset_not_vectorizing():
    phi = lambda text: text
    split_df = sst.dev_reader(sst_home)
    dataset = sst.build_dataset(split_df,
                                phi,
                                vectorizer=None,
                                vectorize=False)
    assert len(dataset['X']) == split_df.shape[0]
    assert dataset['X'] == dataset['raw_examples']
    assert len(dataset['y']) == len(dataset['X'])
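
With vectorize=False, the outputs of phi are passed through unvectorized, which is why dataset['X'] equals dataset['raw_examples'] when phi is the identity. A hypothetical variant in which phi tokenizes instead:

phi = lambda text: text.split()
dataset = sst.build_dataset(split_df, phi, vectorizer=None, vectorize=False)
# dataset['X'] is now a list of token lists rather than a sparse feature matrix.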
Example 3
def hugging_face_bert_classifier_phi(tree):
    reps = hugging_face_bert_phi(tree)
    # return reps.mean(axis=0)  # Another good, easy option.
    return reps[0]  # The output representation above the [CLS] token.


# This is very much like the way we [summed the GloVe representations of these examples](sst_03_neural_networks.ipynb#Distributed-representations-as-features), but now the individual word representations differ depending on the context in which they appear.

# Next we read in the SST train and dev portions as lists of `(tree, label)` pairs:

# In[21]:


hf_train = list(sst.train_reader(SST_HOME, class_func=sst.ternary_class_func))

hf_dev = list(sst.dev_reader(SST_HOME, class_func=sst.ternary_class_func))


# Split the input/output pairs out into separate lists:

# In[22]:


X_hf_tree_train, y_hf_train = zip(*hf_train)

X_hf_tree_dev, y_hf_dev = zip(*hf_dev)


# In the next step, we featurize all of the examples. This is likely to be the slowest step in these experiments:

# In[23]:
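

# A minimal sketch of this featurization step, assuming `hugging_face_bert_classifier_phi` is simply mapped over each list of trees:

X_hf_train = [hugging_face_bert_classifier_phi(tree) for tree in X_hf_tree_train]

X_hf_dev = [hugging_face_bert_classifier_phi(tree) for tree in X_hf_tree_dev]
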
Example 4

import os

import pytest
from sklearn.linear_model import LogisticRegression

import sst
from torch_rnn_classifier import TorchRNNClassifier

__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2019"

sst_home = os.path.join('data', 'trees')


@pytest.mark.parametrize("reader, count", [
    [sst.train_reader(sst_home, class_func=None), 8544],
    [sst.train_reader(sst_home, class_func=sst.binary_class_func), 6920],
    [sst.train_reader(sst_home, class_func=sst.ternary_class_func), 8544],
    [sst.dev_reader(sst_home, class_func=None), 1101],
    [sst.dev_reader(sst_home, class_func=sst.binary_class_func), 872],
    [sst.dev_reader(sst_home, class_func=sst.ternary_class_func), 1101],
])
def test_readers(reader, count):
    result = len(list(reader))
    assert result == count


def test_reader_labeling():
    tree, label = next(
        sst.train_reader(sst_home, class_func=sst.ternary_class_func))
    for subtree in tree.subtrees():
        assert subtree.label() in {'negative', 'neutral', 'positive'}

Example 5
import os

import sst

sst_home = os.path.join('data', 'sentiment')


def test_build_rnn_dataset():
    split_df = sst.dev_reader(sst_home)
    X, y = sst.build_rnn_dataset(split_df)
    assert len(X) == 1101
    assert len(y) == 1101
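
A sketch of how such pairs might feed an RNN model, using TorchRNNClassifier from the course code; the vocabulary construction and the "$UNK" token here are illustrative assumptions, not part of the test:

from torch_rnn_classifier import TorchRNNClassifier

X_train, y_train = sst.build_rnn_dataset(sst.train_reader(sst_home))

# Hypothetical vocabulary: every token observed in the training sequences.
vocab = sorted({w for seq in X_train for w in seq}) + ["$UNK"]

mod = TorchRNNClassifier(vocab)
mod.fit(X_train, y_train)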
Example 6
import os
from collections import Counter

import pytest

import sst
import utils

__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2021"

utils.fix_random_seeds()

sst_home = os.path.join('data', 'sentiment')


@pytest.mark.parametrize(
    "split_df, expected_count",
    [[sst.train_reader(sst_home, include_subtrees=True, dedup=False), 318582],
     [sst.train_reader(sst_home, include_subtrees=True, dedup=True), 159274],
     [sst.train_reader(sst_home, include_subtrees=False, dedup=False), 8544],
     [sst.train_reader(sst_home, include_subtrees=False, dedup=True), 8534],
     [sst.dev_reader(sst_home, include_subtrees=True, dedup=False), 1101],
     [sst.dev_reader(sst_home, include_subtrees=True, dedup=True), 1100],
     [sst.dev_reader(sst_home, include_subtrees=False, dedup=False), 1101],
     [sst.dev_reader(sst_home, include_subtrees=False, dedup=True), 1100]])
def test_readers(split_df, expected_count):
    result = split_df.shape[0]
    assert result == expected_count


def test_build_dataset_vectorizing():
    phi = lambda text: Counter(text.split())
    split_df = sst.dev_reader(sst_home)
    dataset = sst.build_dataset(split_df, phi, vectorizer=None, vectorize=True)
    assert len(dataset['X']) == split_df.shape[0]
    assert len(dataset['y']) == len(dataset['X'])
    assert len(dataset['raw_examples']) == len(dataset['X'])