def augmentation_evaluation(df_train, df_test, policy, p=None):
    """Augment a labeled training DataFrame with Snorkel transformation functions.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data; must contain a ``"label"`` column.
    df_test : pandas.DataFrame
        Unused here; kept for interface compatibility with callers.
        # NOTE(review): consider removing or using it — confirm with call sites.
    policy : str
        Either ``"random"`` (uniform TF sampling via ``RandomPolicy``) or
        ``"mean"`` (weighted sampling via ``MeanFieldPolicy``).
    p : list[float] | None
        Sampling distribution over the five TFs, only used when
        ``policy == "mean"``. Defaults to ``[0.1, 0.1, 0.1, 0.35, 0.35]``.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The augmented training DataFrame and its label values.

    Raises
    ------
    ValueError
        If ``policy`` is not ``"random"`` or ``"mean"`` (the original
        implementation silently returned ``None`` in that case).
    """
    tfs = [
        TransformationFunction.change_addr,
        TransformationFunction.change_business,
        TransformationFunction.change_o,
        TransformationFunction.randomly_delete,
        TransformationFunction.randomly_add,
    ]

    # Only the policy construction differs between the two modes; the
    # apply/report/return tail is shared instead of duplicated per branch.
    if policy == "random":
        aug_policy = RandomPolicy(
            len(tfs), sequence_length=2, n_per_original=2, keep_original=True
        )
    elif policy == "mean":
        if p is None:
            # Favor the random delete/add TFs by default.
            p = [0.1, 0.1, 0.1, 0.35, 0.35]
        aug_policy = MeanFieldPolicy(
            len(tfs),
            sequence_length=2,   # how many TFs to apply per data point
            n_per_original=2,    # augmented points generated per original
            keep_original=True,
            p=p,                 # sampling distribution over the TFs
        )
    else:
        raise ValueError(f"Unknown policy {policy!r}; expected 'random' or 'mean'")

    tf_applier = PandasTFApplier(tfs, aug_policy)
    df_train_augmented = tf_applier.apply(df_train)
    Y_train_augmented = df_train_augmented["label"].values

    print(f"Original training set size: {len(df_train)}")
    print(f"Augmented training set size: {len(df_train_augmented)}")
    return df_train_augmented, Y_train_augmented
# In some cases, we can do better than uniform random sampling. # We might have domain knowledge that some TFs should be applied more frequently than others, # or have trained an [automated data augmentation model](https://snorkel.org/tanda/) # that learned a sampling distribution for the TFs. # Snorkel supports this use case with a # [`MeanFieldPolicy`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/augmentation/snorkel.augmentation.MeanFieldPolicy.html), # which allows you to specify a sampling distribution for the TFs. # We give higher probabilities to the `replace_[X]_with_synonym` TFs, since those provide more information to the model. # %% from snorkel.augmentation import MeanFieldPolicy mean_field_policy = MeanFieldPolicy( len(tfs), sequence_length=2, n_per_original=2, keep_original=True, p=[0.05, 0.05, 0.3, 0.3, 0.3], ) # %% [markdown] # To apply one or more TFs that we've written to a collection of data points according to our policy, we use a # [`PandasTFApplier`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/augmentation/snorkel.augmentation.PandasTFApplier.html) # because our data points are represented with a Pandas DataFrame. # %% {"tags": ["md-exclude-output"]} from snorkel.augmentation import PandasTFApplier tf_applier = PandasTFApplier(tfs, mean_field_policy) df_train_augmented = tf_applier.apply(df_train) Y_train_augmented = df_train_augmented["label"].values
def test_mean_field_policy(self):
    """A degenerate distribution p=[1, 0] must always pick TF index 0,
    so every generated sequence of length 2 is exactly [0, 0]."""
    n_samples = 100
    policy = MeanFieldPolicy(2, sequence_length=2, p=[1, 0])
    drawn = [policy.generate() for _ in range(n_samples)]
    self.assertEqual(drawn, [[0, 0]] * n_samples)