def test_reader__remove_duplicates(self):
    """Verify that ``remove_duplicates`` drops the expected number of rows.

    Loads the fixture at ``dataset_dup_path``, records the row count, runs
    ``remove_duplicates`` and asserts the dataset shrank by exactly two rows
    (the number of duplicates the fixture is expected to contain; per the
    original note, duplicates are judged on the email column).
    """
    expected_removed = 2

    reader = DSReader(dataset_dup_path)
    rows_before = reader.dataset.shape[0]

    reader.remove_duplicates()

    rows_after = reader.dataset.shape[0]
    self.assertEqual(rows_after, rows_before - expected_removed)
# NOTE: the same cleaning pipeline for `my_dataset` (to_lower, remove_digits,
# remove_punctuation_marks, remove_duplicates, remove_stopwords x2, followed by
# a debug print of its dataset) is currently disabled.

# Load the e-mail dataset and run the text-cleaning steps in order.
my_dataset1 = DSReader('C:/Users/Masquerade/Downloads/emails.csv')
my_dataset1.to_lower()
my_dataset1.remove_digits()
my_dataset1.remove_punctuation_marks()
my_dataset1.remove_duplicates()
# NOTE(review): remove_stopwords() is invoked twice in a row; kept as-is --
# confirm whether the second pass is actually required.
my_dataset1.remove_stopwords()
my_dataset1.remove_stopwords()

# Turn the cleaned data into the e-mail and label collections and report
# their shapes.
list_email, list_label = my_dataset1.vectorize()
print(list_email.shape)
print(list_label.shape)

# Disabled alternative: take the raw columns instead of the vectorized output
# (my_dataset1.dataset.email / my_dataset1.dataset.label).
X, y = list_email, list_label

# Split into train/test partitions using train_test_split's defaults.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values)

print("______________________________________________")
print(y_test)