DISPLAY_ALL_TEXT = False

# Column width 0 is used when DISPLAY_ALL_TEXT is set (intended to show full
# text); otherwise cell contents are capped at 50 characters.
pd.set_option("display.max_colwidth", 0 if DISPLAY_ALL_TEXT else 50)

# %% [markdown] {"tags": ["md-exclude"]}
# This next cell makes sure a spaCy English model is downloaded.
# If this is your first time downloading this model, restart the kernel after executing the next cell.

# %% {"tags": ["md-exclude"]}
# Download the spaCy English model
# ! python -m spacy download en_core_web_sm

# %%
from utils import load_spam_dataset

df_train, df_dev, df_valid, df_test = load_spam_dataset()

# We pull out the label vectors for ease of use later
Y_dev = df_dev["label"].values
Y_valid = df_valid["label"].values
Y_test = df_test["label"].values

# %% [markdown]
# Let's view 5 data points from the `dev` set.

# %%
df_dev.sample(5, random_state=3)

# %% [markdown]
# The class distribution varies slightly between `SPAM` and `HAM`, but they're approximately class-balanced.
DISPLAY_ALL_TEXT = False

# Column width 0 is used when DISPLAY_ALL_TEXT is set (intended to show full
# text); otherwise cell contents are capped at 50 characters.
pd.set_option("display.max_colwidth", 0 if DISPLAY_ALL_TEXT else 50)

# %% [markdown] {"tags": ["md-exclude"]}
# This next cell makes sure a spaCy English model is downloaded.
# If this is your first time downloading this model, restart the kernel after executing the next cell.

# %% {"tags": ["md-exclude"]}
# Download the spaCy English model
# ! python -m spacy download en_core_web_sm

# %%
from utils import load_spam_dataset

df_train, df_test = load_spam_dataset()

# We pull out the label vectors for ease of use later
Y_test = df_test["label"].values

# %% [markdown]
# The class distribution varies slightly between `SPAM` and `HAM`, but they're approximately class-balanced.

# %%
# For clarity, we define constants to represent the class labels for spam, ham, and abstaining.
ABSTAIN = -1
HAM = 0
SPAM = 1

# %% [markdown]
# ## 2. Writing Labeling Functions (LFs)
# %% [markdown]
# ## 1. Loading Data

# %% [markdown]
# We load the Kaggle dataset and create Pandas DataFrame objects for each of the sets described above.
# The two main columns in the DataFrames are:
# * **`text`**: Raw text content of the comment
# * **`label`**: Whether the comment is `SPAM` (1) or `HAM` (0).
#
# For more details, check out the [labeling tutorial](https://github.com/snorkel-team/snorkel-tutorials/blob/master/spam/01_spam_tutorial.ipynb).

# %%
from utils import load_spam_dataset

# The second returned split is not used in this tutorial, so it is discarded.
df_train, _, df_valid, df_test = load_spam_dataset(load_train_labels=True)

# We pull out the label vectors for ease of use later
Y_valid = df_valid["label"].values
Y_train = df_train["label"].values
Y_test = df_test["label"].values

# %%
df_train.head()

# %% [markdown]
# ## 2. Writing Transformation Functions (TFs)
#
# Transformation functions are functions that can be applied to a training data point to create another valid training data point of the same class.
# For example, for image classification problems, it is common to rotate or crop images in the training data to create new training inputs.
# Transformation functions should be atomic, e.g., a small rotation of an image, or changing a single word in a sentence.
# %% {"tags": ["md-exclude"]}
import pandas as pd

DISPLAY_ALL_TEXT = False

# Column width 0 is used when DISPLAY_ALL_TEXT is set (intended to show full
# text); otherwise cell contents are capped at 50 characters.
pd.set_option("display.max_colwidth", 0 if DISPLAY_ALL_TEXT else 50)

# %% [markdown]
# _Note:_ this tutorial differs from the labeling tutorial in that we use ground truth labels in the train split for demo purposes.
# SFs are intended to be used *after the training set has already been labeled* by LFs (or by hand) in the training data pipeline.

# %%
from utils import load_spam_dataset

df_train, df_valid, df_test = load_spam_dataset(load_train_labels=True, split_dev=False)

# %% [markdown]
# ## 1. Write slicing functions
#
# We leverage *slicing functions* (SFs), which output binary _masks_ indicating whether a data point is in the slice or not.
# Each slice represents some noisily-defined subset of the data (corresponding to an SF) that we'd like to programmatically monitor.

# %% [markdown]
# In the following cells, we use the [`@slicing_function()`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/slicing/snorkel.slicing.slicing_function.html#snorkel.slicing.slicing_function) decorator to initialize an SF that identifies shortened links in the spam dataset.
# These links could redirect us to potentially dangerous websites, and we don't want our users to click them!
# To select the subset of shortened links in our dataset, we write a regex that checks for the commonly-used `.ly` extension.
#
# You'll notice that the `short_link` SF is a heuristic, like the other programmatic ops we've defined, and may not fully cover the slice of interest.
# That's okay — in the last section, we'll show how a model can handle this in Snorkel.