Code example #1
def test_tf_applier_pandas_generator(self):
    df = self._get_x_df_with_str()
    policy = RandomPolicy(
        1, sequence_length=2, n_per_original=2, keep_original=False
    )
    applier = PandasTFApplier([square], policy)
    gen = applier.apply_generator(df, batch_size=2)
    df_expected = [
        pd.DataFrame(
            {"num": [1, 1, 16, 16], "strs": ["x", "x", "y", "y"]},
            index=[0, 0, 1, 1],
        ),
        pd.DataFrame({"num": [81, 81], "strs": ["z", "z"]}, index=[2, 2]),
    ]
    for df_batch, df_batch_expected in zip(gen, df_expected):
        self.assertEqual(df_batch.num.dtype, "int64")
        pd.testing.assert_frame_equal(df_batch, df_batch_expected)
    pd.testing.assert_frame_equal(df, self._get_x_df_with_str())
Code example #2
import random

import nltk
import pandas as pd
from snorkel.augmentation import PandasTFApplier, RandomPolicy


def main():
    df = pd.read_csv('../airbnb/reviews.tsv', sep='\t')

    newdf = df[['comments', 'Great (1) Not Great (0)']]
    newdf.columns = ['text', 'label']

    # Group each review's sentences into three-sentence chunks; each chunk
    # carries over one randomly chosen sentence from the previous chunk, and
    # any leftover of two or more sentences is flushed as a final, shorter chunk.
    chunks = []
    labels = []
    buffer = []
    for _, row in newdf.iterrows():
        sents = nltk.sent_tokenize(row['text'])
        for sent in sents:
            buffer.append(sent)
            if len(buffer) % 3 == 0:
                chunks.append(" ".join(buffer))
                labels.append(row['label'])
                buffer = [buffer[random.randint(0, 2)]]
        if len(buffer) > 1:
            chunks.append(" ".join(buffer))
            labels.append(row['label'])
            buffer = []

    chunked_df = pd.DataFrame({'text': chunks, 'label': labels})

    # `tfs` (the list of transformation functions) is assumed to be defined
    # elsewhere in this module.
    random_policy = RandomPolicy(len(tfs),
                                 sequence_length=4,
                                 n_per_original=1,
                                 keep_original=True)
    tf_applier = PandasTFApplier(tfs, random_policy)
    newdf_augmented = tf_applier.apply(chunked_df)
    print(len(chunked_df))
    print(len(newdf_augmented))
    newdf_augmented.to_csv('airbnb_augmented.csv')
Code example #3
def test_tf_applier_pandas_modify_in_place(self):
    df = self._get_x_df_dict()
    policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
    applier = PandasTFApplier([modify_in_place], policy)
    df_augmented = applier.apply(df, progress_bar=False)
    idx = [0, 0, 0, 1, 1, 1, 2, 2, 2]
    df_expected = pd.DataFrame(
        dict(d=get_data_dict(DATA_IN_PLACE_EXPECTED)), index=idx
    )
    pd.testing.assert_frame_equal(df_augmented, df_expected)
    pd.testing.assert_frame_equal(df, self._get_x_df_dict())
Code example #4
def test_tf_applier_returns_none(self):
    df = self._get_x_df()
    policy = RandomPolicy(
        1, sequence_length=2, n_per_original=2, keep_original=True
    )
    applier = PandasTFApplier([square_returns_none], policy)
    df_augmented = applier.apply(df, progress_bar=False)
    df_expected = pd.DataFrame(
        dict(num=[1, 1, 1, 2, 3, 81, 81]), index=[0, 0, 0, 1, 2, 2, 2]
    )
    self.assertEqual(df_augmented.num.dtype, "int64")
    pd.testing.assert_frame_equal(df_augmented, df_expected)
    pd.testing.assert_frame_equal(df, self._get_x_df())
Code example #5
def test_tf_applier_pandas_modify_in_place_generator(self):
    df = self._get_x_df_dict()
    policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
    applier = PandasTFApplier([modify_in_place], policy)
    gen = applier.apply_generator(df, batch_size=2)
    idx = [0, 0, 0, 1, 1, 1, 2, 2, 2]
    df_expected = [
        make_df(get_data_dict(DATA_IN_PLACE_EXPECTED[:6]), idx[:6], key="d"),
        make_df(get_data_dict(DATA_IN_PLACE_EXPECTED[6:]), idx[6:], key="d"),
    ]
    for df_batch, df_batch_expected in zip(gen, df_expected):
        pd.testing.assert_frame_equal(df_batch, df_batch_expected)
    pd.testing.assert_frame_equal(df, self._get_x_df_dict())
Code example #6
def test_tf_applier_returns_none_generator(self):
    df = self._get_x_df()
    policy = RandomPolicy(
        1, sequence_length=2, n_per_original=2, keep_original=True
    )
    applier = PandasTFApplier([square_returns_none], policy)
    gen = applier.apply_generator(df, batch_size=2)
    df_expected = [
        make_df([1, 1, 1, 2], [0, 0, 0, 1]),
        make_df([3, 81, 81], [2, 2, 2]),
    ]
    for df_batch, df_batch_expected in zip(gen, df_expected):
        pd.testing.assert_frame_equal(df_batch, df_batch_expected)
    pd.testing.assert_frame_equal(df, self._get_x_df())
Code example #7
def apply_tf_on_data(df_train):
    """
    Apply the transformation functions (from transformation_function.py)
    to the given training data frame and return the enlarged data frame.
    """
    print("")
    print("Transformation Functions:")
    tfs = [transformation_function.change_perek, transformation_function.change_masechet]
    random_policy = RandomPolicy(
        len(tfs), sequence_length=len(tfs), n_per_original=TRANSFORMATION_FACTOR, keep_original=True
    )
    print(f"-Applying [{len(tfs)}] transformation functions with factor [{TRANSFORMATION_FACTOR}] ...")
    tf_applier = PandasTFApplier(tfs, random_policy)
    df_train_augmented = tf_applier.apply(df_train)
    # Y_train_augmented = df_train_augmented["tag"].values
    print("DONE")
    return df_train_augmented
Code example #8
def test_tf_applier_pandas(self):
    df = self._get_x_df_with_str()
    policy = RandomPolicy(
        1, sequence_length=2, n_per_original=1, keep_original=False
    )
    applier = PandasTFApplier([square], policy)
    df_augmented = applier.apply(df, progress_bar=False)
    df_expected = pd.DataFrame(
        dict(num=[1, 16, 81], strs=STR_DATA), index=[0, 1, 2]
    )
    self.assertEqual(df_augmented.num.dtype, "int64")
    pd.testing.assert_frame_equal(df_augmented, df_expected)
    pd.testing.assert_frame_equal(df, self._get_x_df_with_str())

    df_augmented = applier.apply(df, progress_bar=True)
    df_expected = pd.DataFrame(
        dict(num=[1, 16, 81], strs=STR_DATA), index=[0, 1, 2]
    )
    pd.testing.assert_frame_equal(df_augmented, df_expected)
    pd.testing.assert_frame_equal(df, self._get_x_df_with_str())
Code example #9
from snorkel.augmentation import MeanFieldPolicy, PandasTFApplier, RandomPolicy


def augmentation_evaluation(df_train, df_test, policy, p=None):
    tfs = [
        TransformationFunction.change_addr,
        TransformationFunction.change_business,
        TransformationFunction.change_o,
        TransformationFunction.randomly_delete,
        TransformationFunction.randomly_add
    ]

    if policy == "random":
        random_policy = RandomPolicy(len(tfs),
                                     sequence_length=2,
                                     n_per_original=2,
                                     keep_original=True)
        tf_applier = PandasTFApplier(tfs, random_policy)
        df_train_augmented = tf_applier.apply(df_train)
        Y_train_augmented = df_train_augmented["label"].values
        print(f"Original training set size: {len(df_train)}")
        print(f"Augmented training set size: {len(df_train_augmented)}")
        return df_train_augmented, Y_train_augmented

    if policy == "mean":
        if p is None:
            p = [0.1, 0.1, 0.1, 0.35, 0.35]
        mean_field_policy = MeanFieldPolicy(
            len(tfs),
            sequence_length=2,  # how many TFs to apply uniformly at random per data point
            n_per_original=2,  # how many augmented data points to generate per original data point
            keep_original=True,
            p=p,  # specify a sampling distribution for the TFs
        )
        tf_applier = PandasTFApplier(tfs, mean_field_policy)
        df_train_augmented = tf_applier.apply(df_train)
        Y_train_augmented = df_train_augmented["label"].values
        print(f"Original training set size: {len(df_train)}")
        print(f"Augmented training set size: {len(df_train_augmented)}")
        return df_train_augmented, Y_train_augmented
Code example #10
from snorkel.augmentation import transformation_function

# `get_synonyms` (a WordNet lookup) and `random` are defined earlier in the source notebook.
@transformation_function()
def tf_replace_word_with_synonym(x):
    """Try to replace a random word with a synonym."""
    words = x.text.lower().split()
    idx = random.choice(range(len(words)))
    synonyms = get_synonyms(words[idx])
    if len(synonyms) > 0:
        x.text = " ".join(words[:idx] + [synonyms[0]] + words[idx + 1:])
        return x


# %% [markdown]
# Next, we apply this transformation function to our training dataset:

# %%
from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier

tf_policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)
df_train_augmented = tf_applier.apply(df_train)

# %% [markdown]
# Note that a common challenge with data augmentation is figuring out how to tune and apply different transformation functions to best augment a training set.
# This is most commonly done as an ad hoc manual process; Snorkel, however, supports several approaches for learning data augmentation _policies_ automatically.
# For more detail, see the [Spam TFs tutorial](https://snorkel.org/use-cases/02-spam-data-augmentation-tutorial).
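
# %% [markdown]
# As a minimal sketch of one such tunable policy (assuming a list `tfs` of transformation functions defined earlier; the weights below are illustrative, not learned), a `MeanFieldPolicy` samples TFs from a user-specified distribution `p` instead of uniformly:

# %%
from snorkel.augmentation import MeanFieldPolicy

mean_field_policy = MeanFieldPolicy(
    len(tfs),  # assumed list of TFs defined earlier
    sequence_length=2,  # TFs applied per data point
    n_per_original=2,  # augmented data points generated per original
    keep_original=True,
    p=[0.05, 0.05, 0.3, 0.3, 0.3],  # illustrative sampling weights, one per TF
)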

# %% [markdown]
# ## 4) Writing a Slicing Function
#
# Finally, a third operator in Snorkel, *slicing functions (SFs)*, handles the reality that many datasets have certain subsets or _slices_ that are more important than others.
# In Snorkel, we can write SFs to (a) monitor specific slices and (b) improve model performance over them by adding representational capacity targeted on a per-slice basis.
#
# Writing a slicing function is simple.
# For example, we could write one that looks for suspiciously shortened links, which might be critical due to their likelihood of linking to malicious sites:
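
# %% [markdown]
# A minimal sketch of such an SF, assuming data points with a `text` field (the regex is illustrative):

# %%
import re

from snorkel.slicing import slicing_function


@slicing_function()
def short_link(x):
    """Flag data points whose text contains a shortened ".ly" link."""
    return int(bool(re.search(r"\w+\.ly", x.text)))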
Code example #11
from snorkel.augmentation import MeanFieldPolicy

mean_field_policy = MeanFieldPolicy(
    len(tfs),
    sequence_length=2,
    n_per_original=2,
    keep_original=True,
    p=[0.05, 0.05, 0.3, 0.3, 0.3],
)

# %% [markdown]
# To apply one or more TFs that we've written to a collection of data points according to our policy, we use a
# [`PandasTFApplier`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/augmentation/snorkel.augmentation.PandasTFApplier.html)
# because our data points are represented with a Pandas DataFrame.

# %% {"tags": ["md-exclude-output"]}
from snorkel.augmentation import PandasTFApplier

tf_applier = PandasTFApplier(tfs, mean_field_policy)
df_train_augmented = tf_applier.apply(df_train)
Y_train_augmented = df_train_augmented["label"].values

# %%
print(f"Original training set size: {len(df_train)}")
print(f"Augmented training set size: {len(df_train_augmented)}")

# %% [markdown]
# We have almost doubled our dataset using TFs!
# Note that despite `n_per_original` being set to 2, our dataset may not exactly triple in size,
# because sometimes TFs return `None` instead of a new data point
# (e.g. `change_person` when applied to a sentence with no persons).
# If you prefer to have exact proportions for your dataset, you can have TFs that can't perform a
# valid transformation return the original data point rather than `None` (as they do here).
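
# %% [markdown]
# As a minimal sketch of that alternative (`find_person_mention` and `sample_other_name` are hypothetical helpers, not part of this tutorial), a TF that falls back to the original data point keeps the augmented set at exactly `(1 + n_per_original) * len(df_train)` rows:

# %%
from snorkel.augmentation import transformation_function


@transformation_function()
def change_person_or_keep(x):
    mention = find_person_mention(x.text)  # hypothetical helper: returns None if no person found
    if mention is None:
        return x  # return the original data point instead of None
    x.text = x.text.replace(mention, sample_other_name())  # hypothetical helper
    return x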