def test_tf_applier_pandas_generator(self):
    """Check batched augmentation of a DataFrame with int and str columns.

    Applies the ``square`` TF through ``PandasTFApplier.apply_generator``
    with ``batch_size=2`` under a ``RandomPolicy`` (``sequence_length=2``,
    ``n_per_original=2``, ``keep_original=False``) and compares every
    yielded batch with the expected frame. Also verifies that the input
    DataFrame is left unchanged.
    """
    df = self._get_x_df_with_str()
    policy = RandomPolicy(
        1, sequence_length=2, n_per_original=2, keep_original=False
    )
    applier = PandasTFApplier([square], policy)
    gen = applier.apply_generator(df, batch_size=2)
    df_expected = [
        pd.DataFrame(
            {"num": [1, 1, 16, 16], "strs": ["x", "x", "y", "y"]},
            index=[0, 0, 1, 1],
        ),
        pd.DataFrame({"num": [81, 81], "strs": ["z", "z"]}, index=[2, 2]),
    ]
    batches = list(gen)
    # zip() silently stops at the shorter input, so a generator that yields
    # too few (or no) batches would otherwise let this test pass vacuously.
    self.assertEqual(len(batches), len(df_expected))
    for df_batch, df_batch_expected in zip(batches, df_expected):
        self.assertEqual(df_batch.num.dtype, "int64")
        pd.testing.assert_frame_equal(df_batch, df_batch_expected)
    # The applier must not mutate the caller's DataFrame.
    pd.testing.assert_frame_equal(df, self._get_x_df_with_str())
def _chunk_sentences(sentences, chunk_size=3):
    """Join consecutive sentences into overlapping, space-separated chunks.

    Sentences are buffered until ``chunk_size`` of them are collected; the
    buffer is then emitted as one chunk and re-seeded with one randomly
    chosen sentence of that chunk, so neighbouring chunks overlap. A
    trailing buffer is emitted only if it holds more than one sentence
    (i.e. more than just the carried-over seed).

    Args:
        sentences: iterable of sentence strings belonging to one review.
        chunk_size: number of sentences per full chunk; must be at least 2.

    Returns:
        List of chunk strings, possibly empty.
    """
    chunks = []
    buffer = []
    for sentence in sentences:
        buffer.append(sentence)
        if len(buffer) == chunk_size:
            chunks.append(" ".join(buffer))
            # Carry one random sentence over as context for the next chunk.
            buffer = [buffer[random.randint(0, chunk_size - 1)]]
    if len(buffer) > 1:
        chunks.append(" ".join(buffer))
    return chunks


def main():
    """Chunk the Airbnb reviews, augment the chunks and write them to CSV.

    Reads ``../airbnb/reviews.tsv``, keeps the comment and label columns
    (renamed to ``text`` / ``label``), splits every review into sentences
    with ``nltk.sent_tokenize`` and groups them into chunks that inherit the
    review's label. The chunks are augmented with the module-level ``tfs``
    under a ``RandomPolicy`` and written to ``airbnb_augmented.csv``. The
    number of original reviews and of augmented rows is printed.
    """
    df = pd.read_csv('../airbnb/reviews.tsv', sep='\t')
    newdf = df[['comments', 'Great (1) Not Great (0)']]
    newdf.columns = ['text', 'label']

    chunks = []
    labels = []
    # Iterate the two columns directly instead of iterrows(): no per-row
    # Series is built and the unused row index disappears.
    for text, label in zip(newdf['text'], newdf['label']):
        # The sentence buffer lives inside the helper, so it is reset for
        # every review and no sentence leaks into another review's chunk.
        review_chunks = _chunk_sentences(nltk.sent_tokenize(text))
        chunks.extend(review_chunks)
        labels.extend([label] * len(review_chunks))

    chunkedDf = pd.DataFrame({'text': chunks, 'label': labels})

    # NOTE(review): `tfs` is not defined in this block; presumably a
    # module-level list of transformation functions - confirm.
    random_policy = RandomPolicy(
        len(tfs), sequence_length=4, n_per_original=1, keep_original=True
    )
    tf_applier = PandasTFApplier(tfs, random_policy)
    newdf_augmented = tf_applier.apply(chunkedDf)

    print(len(newdf))
    print(len(newdf_augmented))
    newdf_augmented.to_csv('airbnb_augmented.csv')
def test_tf_applier_pandas_modify_in_place(self):
    """Apply ``modify_in_place`` once per copy and check the full result.

    Uses an ``ApplyOnePolicy`` with two augmented copies per row plus the
    original, compares the output with ``DATA_IN_PLACE_EXPECTED`` and
    verifies that the source DataFrame is unchanged afterwards.
    """
    source = self._get_x_df_dict()
    one_tf_policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
    augmenter = PandasTFApplier([modify_in_place], one_tf_policy)

    result = augmenter.apply(source, progress_bar=False)

    # Each of the three source rows appears three times in the output.
    expected = pd.DataFrame(
        {"d": get_data_dict(DATA_IN_PLACE_EXPECTED)},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )
    pd.testing.assert_frame_equal(result, expected)
    pd.testing.assert_frame_equal(source, self._get_x_df_dict())
def test_tf_applier_returns_none(self):
    """Check the augmented frame produced with ``square_returns_none``.

    Runs the TF under a ``RandomPolicy`` (``sequence_length=2``,
    ``n_per_original=2``, ``keep_original=True``) and compares values,
    index and dtype of the result with the expected frame; the input
    DataFrame must stay unchanged.
    """
    source = self._get_x_df()
    augmenter = PandasTFApplier(
        [square_returns_none],
        RandomPolicy(
            1, sequence_length=2, n_per_original=2, keep_original=True
        ),
    )

    result = augmenter.apply(source, progress_bar=False)

    # Row 1 contributes only its original value to the expected output.
    expected = pd.DataFrame(
        {"num": [1, 1, 1, 2, 3, 81, 81]},
        index=[0, 0, 0, 1, 2, 2, 2],
    )
    self.assertEqual(result.num.dtype, "int64")
    pd.testing.assert_frame_equal(result, expected)
    pd.testing.assert_frame_equal(source, self._get_x_df())
def test_tf_applier_pandas_modify_in_place_generator(self):
    """Check batched augmentation with the ``modify_in_place`` TF.

    Runs ``PandasTFApplier.apply_generator`` with ``batch_size=2`` under an
    ``ApplyOnePolicy`` (two augmented copies per row, originals kept) and
    compares every yielded batch with the matching slice of
    ``DATA_IN_PLACE_EXPECTED``. Also verifies that the input DataFrame is
    left unchanged.
    """
    df = self._get_x_df_dict()
    policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
    applier = PandasTFApplier([modify_in_place], policy)
    gen = applier.apply_generator(df, batch_size=2)
    idx = [0, 0, 0, 1, 1, 1, 2, 2, 2]
    # The expected rows and index are split 6 / 3 across the two batches.
    df_expected = [
        make_df(get_data_dict(DATA_IN_PLACE_EXPECTED[:6]), idx[:6], key="d"),
        make_df(get_data_dict(DATA_IN_PLACE_EXPECTED[6:]), idx[6:], key="d"),
    ]
    batches = list(gen)
    # zip() silently stops at the shorter input, so a generator that yields
    # too few (or no) batches would otherwise let this test pass vacuously.
    self.assertEqual(len(batches), len(df_expected))
    for df_batch, df_batch_expected in zip(batches, df_expected):
        pd.testing.assert_frame_equal(df_batch, df_batch_expected)
    # The applier must not mutate the caller's DataFrame.
    pd.testing.assert_frame_equal(df, self._get_x_df_dict())
def test_tf_applier_returns_none_generator(self):
    """Check batched augmentation with the ``square_returns_none`` TF.

    Runs ``PandasTFApplier.apply_generator`` with ``batch_size=2`` under a
    ``RandomPolicy`` (``sequence_length=2``, ``n_per_original=2``,
    ``keep_original=True``) and compares every yielded batch with the
    expected frame. Also verifies that the input DataFrame is left
    unchanged.
    """
    df = self._get_x_df()
    policy = RandomPolicy(
        1, sequence_length=2, n_per_original=2, keep_original=True
    )
    applier = PandasTFApplier([square_returns_none], policy)
    gen = applier.apply_generator(df, batch_size=2)
    df_expected = [
        make_df([1, 1, 1, 2], [0, 0, 0, 1]),
        make_df([3, 81, 81], [2, 2, 2]),
    ]
    batches = list(gen)
    # zip() silently stops at the shorter input, so a generator that yields
    # too few (or no) batches would otherwise let this test pass vacuously.
    self.assertEqual(len(batches), len(df_expected))
    for df_batch, df_batch_expected in zip(batches, df_expected):
        pd.testing.assert_frame_equal(df_batch, df_batch_expected)
    # The applier must not mutate the caller's DataFrame.
    pd.testing.assert_frame_equal(df, self._get_x_df())
def apply_tf_on_data(df_train):
    """Augment the training data frame with the transformation functions.

    Applies ``change_perek`` and ``change_masechet`` from the
    ``transformation_function`` module through a ``RandomPolicy`` whose
    sequence length equals the number of TFs and which produces
    ``TRANSFORMATION_FACTOR`` augmented copies per original row while
    keeping the originals. Progress messages are printed to stdout.

    Args:
        df_train: the training data frame to augment.

    Returns:
        The enlarged data frame returned by ``PandasTFApplier.apply``.
    """
    print("")
    print("Transformation Functions:")

    tfs = [
        transformation_function.change_perek,
        transformation_function.change_masechet,
    ]
    n_tfs = len(tfs)
    random_policy = RandomPolicy(
        n_tfs,
        sequence_length=n_tfs,
        n_per_original=TRANSFORMATION_FACTOR,
        keep_original=True,
    )

    print(
        f"-Applying [{n_tfs}] transformation functions "
        f"with factor [{TRANSFORMATION_FACTOR}] ..."
    )
    augmented = PandasTFApplier(tfs, random_policy).apply(df_train)
    print("DONE")
    return augmented
def test_tf_applier_pandas(self):
    """Check ``PandasTFApplier.apply`` with and without a progress bar.

    Applies ``square`` under a ``RandomPolicy`` (``sequence_length=2``,
    ``n_per_original=1``, ``keep_original=False``) to a frame with int and
    str columns. Both runs must produce the expected frame and leave the
    input DataFrame unchanged; the first run additionally checks the dtype
    of the ``num`` column.
    """
    source = self._get_x_df_with_str()
    augmenter = PandasTFApplier(
        [square],
        RandomPolicy(
            1, sequence_length=2, n_per_original=1, keep_original=False
        ),
    )

    # First pass: progress bar disabled.
    result = augmenter.apply(source, progress_bar=False)
    expected = pd.DataFrame(
        {"num": [1, 16, 81], "strs": STR_DATA}, index=[0, 1, 2]
    )
    self.assertEqual(result.num.dtype, "int64")
    pd.testing.assert_frame_equal(result, expected)
    pd.testing.assert_frame_equal(source, self._get_x_df_with_str())

    # Second pass: same expectation with the progress bar enabled.
    result = augmenter.apply(source, progress_bar=True)
    expected = pd.DataFrame(
        {"num": [1, 16, 81], "strs": STR_DATA}, index=[0, 1, 2]
    )
    pd.testing.assert_frame_equal(result, expected)
    pd.testing.assert_frame_equal(source, self._get_x_df_with_str())
def augmentation_evaluation(df_train, df_test, policy, p=None):
    """Augment ``df_train`` with the five TFs under the chosen policy.

    Args:
        df_train: training data frame; must have a ``"label"`` column.
        df_test: accepted but not used by this function.
        policy: ``"random"`` selects a ``RandomPolicy``, ``"mean"`` selects a
            ``MeanFieldPolicy``. Any other value makes the function return
            ``None`` without doing any augmentation.
        p: per-TF sampling distribution for the ``"mean"`` policy; defaults
            to ``[0.1, 0.1, 0.1, 0.35, 0.35]`` when ``None``. Ignored for
            ``"random"``.

    Returns:
        Tuple ``(df_train_augmented, Y_train_augmented)`` where the second
        item is the ``"label"`` column values of the augmented frame, or
        ``None`` for an unrecognised ``policy``.
    """
    tfs = [
        TransformationFunction.change_addr,
        TransformationFunction.change_business,
        TransformationFunction.change_o,
        TransformationFunction.randomly_delete,
        TransformationFunction.randomly_add,
    ]

    if policy == "random":
        tf_policy = RandomPolicy(
            len(tfs), sequence_length=2, n_per_original=2, keep_original=True
        )
    elif policy == "mean":
        sampling = [0.1, 0.1, 0.1, 0.35, 0.35] if p is None else p
        tf_policy = MeanFieldPolicy(
            len(tfs),
            sequence_length=2,  # number of TFs applied per data point
            n_per_original=2,  # augmented data points per original
            keep_original=True,
            p=sampling,  # sampling distribution over the TFs
        )
    else:
        # Unknown policy name: nothing is augmented.
        return None

    # Both policies share the same apply / report / return tail.
    augmented = PandasTFApplier(tfs, tf_policy).apply(df_train)
    labels = augmented["label"].values
    print(f"Original training set size: {len(df_train)}")
    print(f"Augmented training set size: {len(augmented)}")
    return augmented, labels
# NOTE(review): the statements down to `return x` are the tail of a function
# whose `def` line lies outside the visible source.
# Pick a random word position and, if that word has any synonyms, replace it
# with the first one.
words = x.text.lower().split()
idx = random.choice(range(len(words)))
synonyms = get_synonyms(words[idx])
if len(synonyms) > 0:
    x.text = " ".join(words[:idx] + [synonyms[0]] + words[idx + 1:])
return x


# %% [markdown]
# Next, we apply this transformation function to our training dataset:

# %%
from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier

tf_policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)
df_train_augmented = tf_applier.apply(df_train)

# %% [markdown]
# Note that a common challenge with data augmentation is figuring out how to tune and apply different transformation functions to best augment a training set.
# This is most commonly done as an ad hoc manual process; however, in Snorkel, various approaches for using automatically learned data augmentation _policies_ are supported.
# For more detail, see the Spam TFs tutorial.

# %% [markdown]
# ## 4) Writing a Slicing Function
#
# Finally, a third operator in Snorkel, *slicing functions (SFs)*, handles the reality that many datasets have certain subsets or _slices_ that are more important than others.
# In Snorkel, we can write SFs to (a) monitor specific slices and (b) improve model performance over them by adding representational capacity targeted on a per-slice basis.
#
# Writing a slicing function is simple.
# For example, we could write one that looks for suspiciously shortened links, which might be critical due to their likelihood of linking to malicious sites:
# NOTE(review): the arguments down to the closing parenthesis are the tail of
# a call whose opening line lies outside the visible source.
    len(tfs),
    sequence_length=2,
    n_per_original=2,
    keep_original=True,
    p=[0.05, 0.05, 0.3, 0.3, 0.3],
)

# %% [markdown]
# To apply one or more TFs that we've written to a collection of data points according to our policy, we use a
# `PandasTFApplier`
# because our data points are represented with a Pandas DataFrame.

# %% {"tags": ["md-exclude-output"]}
from snorkel.augmentation import PandasTFApplier

tf_applier = PandasTFApplier(tfs, mean_field_policy)
df_train_augmented = tf_applier.apply(df_train)
Y_train_augmented = df_train_augmented["label"].values

# %%
print(f"Original training set size: {len(df_train)}")
print(f"Augmented training set size: {len(df_train_augmented)}")

# %% [markdown]
# We have almost doubled our dataset using TFs!
# Note that despite `n_per_original` being set to 2, our dataset may not exactly triple in size,
# because sometimes TFs return `None` instead of a new data point
# (e.g. `change_person` when applied to a sentence with no persons).
# If you prefer to have exact proportions for your dataset, you can have TFs that can't perform a
# valid transformation return the original data point rather than `None` (which is what the TFs here return).