def test_reader__to_lower(self):
    """Check that ``to_lower`` lower-cases every email in the dataset.

    An email counts as lower-cased when ``str.lower()`` leaves it unchanged.
    ``str.islower()`` is deliberately not used: it returns ``False`` for
    strings without any cased characters (empty or digits-only text), which
    would make the test fail on rows that were processed correctly.
    """
    reader = DSReader(dataset_capital)
    reader.to_lower()
    # Collect the offending values so a failure shows which emails were missed.
    not_lowered = [
        row.email
        for row in reader.dataset.itertuples()
        if row.email != row.email.lower()
    ]
    self.assertEqual([], not_lowered)
def test_reader__remove_punctuation_marks(self):
    """Check that ``remove_punctuation_marks`` strips punctuation from emails.

    After processing, no character of ``string.punctuation`` may remain in
    any value of the ``email`` column.
    """
    reader = DSReader(dataset_punctuation_marks)
    reader.to_lower()
    reader.remove_punctuation_marks()
    punctuation = set(string.punctuation)
    for email in reader.dataset['email']:
        # Punctuation marks are single characters, so one pass over the whole
        # email is equivalent to checking every space-separated word.
        leftover = punctuation.intersection(email)
        self.assertEqual(
            set(), leftover,
            'punctuation left in {!r}'.format(email))
# Absolute path of a small local fixture. The visible pipeline below does not
# read it; it belonged to a now-disabled trial run of the same cleaning steps
# on a smaller dataset.
my_data_test1 = os.path.abspath('../tests/datasets/test_dataset_1_digits.csv')

# Load the full email corpus.
# NOTE(review): machine-specific absolute path; confirm before running elsewhere.
my_dataset1 = DSReader('C:/Users/Masquerade/Downloads/emails.csv')

# Cleaning steps, run in this exact order. Their return values are not used,
# so each call presumably updates the reader's dataset in place.
my_dataset1.to_lower()
my_dataset1.remove_digits()
my_dataset1.remove_punctuation_marks()
my_dataset1.remove_duplicates()
# NOTE(review): remove_stopwords() is called twice; kept as in the original.
# Confirm whether the second pass is intentional.
my_dataset1.remove_stopwords()
my_dataset1.remove_stopwords()

# Turn the cleaned dataset into two array-likes and show their shapes.
list_email, list_label = my_dataset1.vectorize()
print(list_email.shape)
print(list_label.shape)

# Conventional feature/target names for the vectorized data.
# Disabled alternative: use the raw columns instead, i.e.
#   my_dataset1.dataset.email and my_dataset1.dataset.label
X = list_email
y = list_label