def test_reader__vectorize_empty(self):
    """Vectorizing the empty dataset must give empty X and y outputs."""
    empty_reader = DSReader(dataset_empty_path)
    empty_reader.make_dictionary()
    features, labels = empty_reader.vectorize()
    # Both outputs are checked the same way: their list form must be [].
    for produced in (features, labels):
        self.assertEqual([], produced.tolist())
def test_reader__remove_stopwords(self):
    """Check that remove_stopwords leaves no English stop word behind.

    After the call, no space-separated token of any ``email`` cell may
    appear (compared in lower case) in NLTK's English stop-word list.
    """
    reader = DSReader(dataset_stopwords)
    reader.remove_stopwords()
    # A frozenset gives constant-time membership tests; the list is
    # consulted once per word of every email.
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    for _, row in reader.dataset.iterrows():
        for word in row['email'].split(' '):
            # The message names the offending token so a failure can be
            # traced back to the fixture without re-running under a debugger.
            self.assertFalse(
                word.lower() in stop_words,
                "stop word %r was not removed" % word)
def test_reader__remove_digits(self):
    """Check remove_digits: no email may contain an ASCII digit afterwards."""
    digit_reader = DSReader(dataset_digits)
    digit_reader.remove_digits()
    for _, record in digit_reader.dataset.iterrows():
        for token in record['email'].split(' '):
            # True as soon as any of '0'..'9' occurs inside the token.
            has_digit = any(ch in str(token) for ch in string.digits)
            self.assertEqual(False, has_digit)
def test_reader__to_lower(self):
    """Check that to_lower converts every email in the dataset to lower case.

    Each email is compared with its own ``lower()`` form instead of being
    tested with ``str.islower()``: ``islower()`` returns False for a string
    that has no cased characters at all (for example digits or punctuation
    only), which would fail the test on text that is already lower-cased.
    """
    reader = DSReader(dataset_capital)
    reader.to_lower()
    # Asserting row by row makes a failure show the offending email.
    for line in reader.dataset.itertuples():
        self.assertEqual(line.email.lower(), line.email)
def test_reader__remove_duplicates(self):
    """Check that remove_duplicates shrinks the dataset by the known count.

    The duplicates fixture is expected to lose exactly two rows.
    """
    duplicates_in_fixture = 2
    reader = DSReader(dataset_dup_path)
    rows_before = reader.dataset.shape[0]
    reader.remove_duplicates()
    rows_after = reader.dataset.shape[0]
    self.assertEqual(rows_after, rows_before - duplicates_in_fixture)
def test_reader__make_dictionary_valid(self):
    """Check that make_dictionary collects exactly the expected words.

    The dictionary must hold as many distinct entries as the expected
    list and, once sorted, must equal it.
    """
    expected_words = [
        "During", "this", "webinar", "we", "will", "cover", "what", "is",
        "DevOps", "and", "Cloud", "Native", "New", "came", "up", "storage",
    ]
    reader = DSReader(dataset_path)
    reader.make_dictionary()
    built = reader.dictionary
    # Number of distinct entries first, then the exact contents.
    distinct_count = len(set(built))
    self.assertEqual(distinct_count, len(expected_words))
    self.assertEqual(sorted(built), sorted(expected_words))
def test_reader__vectorize(self):
    """Check that vectorize returns the expected rows and labels.

    X is compared row by row (via ``tolist()``) with the expected rows,
    and y (via ``tolist()``) with the expected labels.
    """
    x_result = [
        ['During this webinar we will cover what is DevOps and Cloud Native'],
        ['New webinar came up'],
        [
            'During this webinar we will cover what is DevOps and Cloud '
            'Native and storage'
        ],
    ]
    y_result = [1, 1, 0]
    reader = DSReader(dataset_path)
    reader.make_dictionary()
    X, y = reader.vectorize()
    # Guard against a vacuous pass: the row loop is driven by X, so an
    # empty or truncated X would otherwise compare nothing and succeed.
    self.assertEqual(len(x_result), len(X))
    for expected_row, row in zip(x_result, X):
        self.assertEqual(expected_row, row.tolist())
    self.assertEqual(y_result, y.tolist())
def test_reader__split_test_and_train_data_zero_size(self):
    """Check that split_train_and_test raises when the size argument is 0."""
    reader = DSReader(dataset_split_path)
    reader.make_dictionary()
    features, labels = reader.vectorize()
    zero_share = 0
    with self.assertRaises(Exception):
        # The four-way unpacking stays inside the context manager: an
        # error raised while unpacking also satisfies the assertion.
        train_x, train_y, test_x, test_y = reader.split_train_and_test(
            features, labels, zero_share)
def test_reader__split_test_and_train_data_empty(self):
    """Check that split_train_and_test raises on the empty dataset."""
    empty_reader = DSReader(dataset_empty_path)
    empty_reader.make_dictionary()
    features, labels = empty_reader.vectorize()
    train_share = 0.7
    with self.assertRaises(Exception):
        # The four-way unpacking stays inside the context manager: an
        # error raised while unpacking also satisfies the assertion.
        train_x, train_y, test_x, test_y = empty_reader.split_train_and_test(
            features, labels, train_share)
def test_reader__remove_punctuation_marks(self):
    """Check that remove_punctuation_marks strips all punctuation.

    The dataset is lower-cased first; afterwards no space-separated token
    of any email may contain a character from ``string.punctuation``.
    """
    reader = DSReader(dataset_punctuation_marks)
    reader.to_lower()
    reader.remove_punctuation_marks()
    for _, record in reader.dataset.iterrows():
        for token in record['email'].split(' '):
            is_clean = all(mark not in token for mark in string.punctuation)
            self.assertEqual(is_clean, True)
def test_reader__split_test_and_train_data(self):
    """Check that split_train_and_test divides the data by the given share.

    With a share of 0.7 the train parts must hold 70% of the rows and the
    test parts the remaining 30%, for both X and y.
    """
    reader = DSReader(dataset_split_path)
    reader.make_dictionary()
    X, y = reader.vectorize()
    percent = 0.7
    X_train, y_train, X_test, y_test = reader.split_train_and_test(
        X, y, percent)
    test_share = round(1 - percent, 2)
    # The expected sizes are float products (n * 0.7, n * 0.3). Exact
    # equality against an integer row count can fail purely because of
    # binary rounding, so the comparison tolerates a tiny error.
    self.assertAlmostEqual(X_train.shape[0], X.shape[0] * percent)
    self.assertAlmostEqual(X_test.shape[0], X.shape[0] * test_share)
    self.assertAlmostEqual(y_train.shape[0], y.shape[0] * percent)
    self.assertAlmostEqual(y_test.shape[0], y.shape[0] * test_share)
# Ad-hoc driver: loads an e-mail CSV with DSReader, runs the cleaning steps
# and vectorizes the result into X / y.
# NOTE(review): these statements sit at module level, so they run on import.
my_data_test = os.path.abspath('../tests/datasets/my_test_dataset.csv')
my_data_test1 = os.path.abspath('../tests/datasets/test_dataset_1_digits.csv')
# NOTE(review): in the code visible here my_data_test is referenced only by
# the disabled experiment below and my_data_test1 is not referenced at all.
# Disabled experiment on the small test dataset (kept for reference):
# my_dataset = DSReader(my_data_test)
# my_dataset.to_lower()
# my_dataset.remove_digits()
# my_dataset.remove_punctuation_marks()
# my_dataset.remove_duplicates()
# my_dataset.remove_stopwords()
# my_dataset.remove_stopwords()
# print(my_dataset.dataset)

# NOTE(review): hard-coded absolute path on one developer's machine; this
# line cannot work elsewhere unless that exact file exists.
my_dataset1 = DSReader('C:/Users/Masquerade/Downloads/emails.csv')
# Cleaning steps, applied in this order.
my_dataset1.to_lower()
my_dataset1.remove_digits()
my_dataset1.remove_punctuation_marks()
my_dataset1.remove_duplicates()
my_dataset1.remove_stopwords()
# NOTE(review): second consecutive remove_stopwords() call; it looks
# redundant, but confirm the method is idempotent before removing it.
my_dataset1.remove_stopwords()
# print(my_dataset1.dataset)
list_email, list_label = my_dataset1.vectorize()
# Print the shapes of the two vectorized outputs.
print(list_email.shape)
print(list_label.shape)
# Expose the vectorized outputs under the names X and y.
X, y = list_email, list_label
def test_reader__make_dictionary_empty(self):
    """Check that make_dictionary on the empty dataset yields no entries."""
    empty_reader = DSReader(dataset_empty_path)
    empty_reader.make_dictionary()
    entry_count = len(empty_reader.dictionary)
    self.assertEqual(0, entry_count)