def test_build_dataset_vectorizing():
    phi = lambda text: Counter(text.split())
    split_df = sst.dev_reader(sst_home)
    dataset = sst.build_dataset(
        split_df, phi, vectorizer=None, vectorize=True)
    assert len(dataset['X']) == split_df.shape[0]
    assert len(dataset['y']) == len(dataset['X'])
    assert len(dataset['raw_examples']) == len(dataset['X'])
def test_build_dataset_not_vectorizing():
    phi = lambda text: text
    split_df = sst.dev_reader(sst_home)
    dataset = sst.build_dataset(
        split_df, phi, vectorizer=None, vectorize=False)
    assert len(dataset['X']) == split_df.shape[0]
    assert dataset['X'] == dataset['raw_examples']
    assert len(dataset['y']) == len(dataset['X'])
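# The two tests above exercise `sst.build_dataset` with `vectorizer=None`,
# i.e., fitting a fresh vectorizer on the split. A hedged usage sketch of the
# presumed intended pattern follows: fit the vectorizer on the train split and
# reuse it for dev. The `example_train_dev_featurization` name, the
# 'vectorizer' key, and the reuse pattern are assumptions for illustration,
# not something the tests themselves verify.

def example_train_dev_featurization(phi):
    # Fit a new vectorizer on the training split (assumed behavior when
    # vectorizer=None and vectorize=True).
    train = sst.build_dataset(
        sst.train_reader(sst_home), phi, vectorizer=None, vectorize=True)
    # Reuse the fitted vectorizer so the dev features live in the same feature
    # space as train (assumes the returned dict exposes it as 'vectorizer').
    dev = sst.build_dataset(
        sst.dev_reader(sst_home), phi,
        vectorizer=train['vectorizer'], vectorize=True)
    return train, dev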
def hugging_face_bert_classifier_phi(tree):
    reps = hugging_face_bert_phi(tree)
    # return reps.mean(axis=0)  # Another good, easy option.
    return reps[0]

# This is very much like what we did when we
# [summed the GloVe representations of these examples](sst_03_neural_networks.ipynb#Distributed-representations-as-features),
# but now the individual word representations differ depending on the context
# in which they appear.

# Next we read in the SST train and dev portions as lists of `(tree, label)` pairs:

hf_train = list(sst.train_reader(SST_HOME, class_func=sst.ternary_class_func))
hf_dev = list(sst.dev_reader(SST_HOME, class_func=sst.ternary_class_func))

# Split the input/output pairs out into separate lists:

X_hf_tree_train, y_hf_train = zip(*hf_train)
X_hf_tree_dev, y_hf_dev = zip(*hf_dev)

# In the next step, we featurize all of the examples. These steps are likely
# to be the slowest in these experiments:
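# A hedged sketch of that featurization step (it is not spelled out above):
# apply `hugging_face_bert_classifier_phi` to every tree in the train and dev
# inputs. The list-comprehension form and the output names `X_hf_train` /
# `X_hf_dev` are assumptions for illustration, not a transcription of the
# original notebook.

X_hf_train = [hugging_face_bert_classifier_phi(tree) for tree in X_hf_tree_train]
X_hf_dev = [hugging_face_bert_classifier_phi(tree) for tree in X_hf_tree_dev]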
from collections import Counter
import os

import pytest
from sklearn.linear_model import LogisticRegression

import sst
import utils
from torch_rnn_classifier import TorchRNNClassifier

__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2019"

sst_home = os.path.join('data', 'trees')


@pytest.mark.parametrize("reader, count", [
    [sst.train_reader(sst_home, class_func=None), 8544],
    [sst.train_reader(sst_home, class_func=sst.binary_class_func), 6920],
    [sst.train_reader(sst_home, class_func=sst.ternary_class_func), 8544],
    [sst.dev_reader(sst_home, class_func=None), 1101],
    [sst.dev_reader(sst_home, class_func=sst.binary_class_func), 872],
    [sst.dev_reader(sst_home, class_func=sst.ternary_class_func), 1101],
])
def test_readers(reader, count):
    result = len(list(reader))
    assert result == count


def test_reader_labeling():
    tree, label = next(
        sst.train_reader(sst_home, class_func=sst.ternary_class_func))
    for subtree in tree.subtrees():
        assert subtree.label() in {'negative', 'neutral', 'positive'}
def test_build_rnn_dataset():
    split_df = sst.dev_reader(sst_home)
    X, y = sst.build_rnn_dataset(split_df)
    assert len(X) == 1101
    assert len(y) == 1101
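# A hedged sketch (not one of the tests) of how the dataset built by
# `sst.build_rnn_dataset` might feed the imported TorchRNNClassifier. The
# `example_rnn_training_run` name, the vocabulary construction, the "$UNK"
# placeholder, and the constructor/fit calls are assumptions for illustration
# rather than the course's canonical recipe.

def example_rnn_training_run():
    train_df = sst.train_reader(sst_home)
    X_train, y_train = sst.build_rnn_dataset(train_df)
    # Assumes X_train is a list of token sequences; build a simple vocabulary
    # with an unknown-word placeholder appended.
    vocab = sorted({w for seq in X_train for w in seq}) + ["$UNK"]
    model = TorchRNNClassifier(vocab)
    model.fit(X_train, y_train)
    return model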
__author__ = "Christopher Potts" __version__ = "CS224u, Stanford, Spring 2021" utils.fix_random_seeds() sst_home = os.path.join('data', 'sentiment') @pytest.mark.parametrize( "split_df, expected_count", [[sst.train_reader(sst_home, include_subtrees=True, dedup=False), 318582], [sst.train_reader(sst_home, include_subtrees=True, dedup=True), 159274], [sst.train_reader(sst_home, include_subtrees=False, dedup=False), 8544], [sst.train_reader(sst_home, include_subtrees=False, dedup=True), 8534], [sst.dev_reader(sst_home, include_subtrees=True, dedup=False), 1101], [sst.dev_reader(sst_home, include_subtrees=True, dedup=True), 1100], [sst.dev_reader(sst_home, include_subtrees=False, dedup=False), 1101], [sst.dev_reader(sst_home, include_subtrees=False, dedup=True), 1100]]) def test_readers(split_df, expected_count): result = split_df.shape[0] assert result == expected_count def test_build_dataset_vectorizing(): phi = lambda text: Counter(text.split()) split_df = sst.dev_reader(sst_home) dataset = sst.build_dataset(split_df, phi, vectorizer=None, vectorize=True) assert len(dataset['X']) == split_df.shape[0] assert len(dataset['y']) == len(dataset['X']) assert len(dataset['raw_examples']) == len(dataset['X'])