Example 1
# Snorkel policy/applier imports used below; TransformationFunction is assumed
# to be a user-defined module/class exposing the TFs referenced in this function.
from snorkel.augmentation import MeanFieldPolicy, PandasTFApplier, RandomPolicy


def augmentation_evaluation(df_train, df_test, policy, p=None):
    tfs = [
        TransformationFunction.change_addr,
        TransformationFunction.change_business,
        TransformationFunction.change_o,
        TransformationFunction.randomly_delete,
        TransformationFunction.randomly_add
    ]

    if policy == "random":
        random_policy = RandomPolicy(len(tfs),
                                     sequence_length=2,
                                     n_per_original=2,
                                     keep_original=True)
        tf_applier = PandasTFApplier(tfs, random_policy)
        df_train_augmented = tf_applier.apply(df_train)
        Y_train_augmented = df_train_augmented["label"].values
        print(f"Original training set size: {len(df_train)}")
        print(f"Augmented training set size: {len(df_train_augmented)}")
        return df_train_augmented, Y_train_augmented

    if policy == "mean":
        if p is None:
            p = [0.1, 0.1, 0.1, 0.35, 0.35]
        mean_field_policy = MeanFieldPolicy(
            len(tfs),
            sequence_length=2,  # how many TFs to apply to each data point
            n_per_original=2,  # how many augmented data points to generate per original
            keep_original=True,
            p=p,  # sampling distribution over the TFs
        )
        tf_applier = PandasTFApplier(tfs, mean_field_policy)
        df_train_augmented = tf_applier.apply(df_train)
        Y_train_augmented = df_train_augmented["label"].values
        print(f"Original training set size: {len(df_train)}")
        print(f"Augmented training set size: {len(df_train_augmented)}")
        return df_train_augmented, Y_train_augmented
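
# A minimal usage sketch (not part of the original snippet): df_train and df_test
# are assumed to be pandas DataFrames with the columns the TFs above expect,
# including a "label" column.
df_train_augmented, Y_train_augmented = augmentation_evaluation(
    df_train, df_test, policy="mean", p=[0.1, 0.1, 0.1, 0.35, 0.35]
)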
Example 2
# In some cases, we can do better than uniform random sampling.
# We might have domain knowledge that some TFs should be applied more frequently than others,
# or have trained an [automated data augmentation model](https://snorkel.org/tanda/)
# that learned a sampling distribution for the TFs.
# Snorkel supports this use case with a
# [`MeanFieldPolicy`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/augmentation/snorkel.augmentation.MeanFieldPolicy.html),
# which allows you to specify a sampling distribution for the TFs.
# We give higher probabilities to the `replace_[X]_with_synonym` TFs, since those provide more information to the model.

# %%
from snorkel.augmentation import MeanFieldPolicy

mean_field_policy = MeanFieldPolicy(
    len(tfs),
    sequence_length=2,
    n_per_original=2,
    keep_original=True,
    p=[0.05, 0.05, 0.3, 0.3, 0.3],
)

# %% [markdown]
# To apply one or more TFs that we've written to a collection of data points according to our policy, we use a
# [`PandasTFApplier`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/augmentation/snorkel.augmentation.PandasTFApplier.html)
# because our data points are represented with a Pandas DataFrame.

# %% {"tags": ["md-exclude-output"]}
from snorkel.augmentation import PandasTFApplier

tf_applier = PandasTFApplier(tfs, mean_field_policy)
df_train_augmented = tf_applier.apply(df_train)
Y_train_augmented = df_train_augmented["label"].values
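
# %% [markdown]
# As a quick sanity check (a minimal sketch reusing the names defined above), we can
# compare the original and augmented training set sizes.

# %%
print(f"Original training set size: {len(df_train)}")
print(f"Augmented training set size: {len(df_train_augmented)}")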
Example 3
    def test_mean_field_policy(self):
        policy = MeanFieldPolicy(2, sequence_length=2, p=[1, 0])
        n_samples = 100
        samples = [policy.generate() for _ in range(n_samples)]
        self.assertEqual(samples.count([0, 0]), n_samples)
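
    # A companion sketch (not from the original test suite): with a uniform p,
    # generate() should still return sequences of the configured length whose
    # indices come from the available TFs.
    def test_mean_field_policy_uniform(self):
        policy = MeanFieldPolicy(2, sequence_length=2, p=[0.5, 0.5])
        samples = [policy.generate() for _ in range(100)]
        for seq in samples:
            self.assertEqual(len(seq), 2)
            self.assertTrue(all(idx in (0, 1) for idx in seq))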