def test_reader__remove_stopwords(self):
        """Verify that remove_stopwords() strips every NLTK English stopword
        from the 'email' column of the stopword fixture dataset.

        Runs the cleaning step, then checks that no space-separated token
        (compared case-insensitively) is still an English stopword.
        """
        reader = DSReader(dataset_stopwords)
        reader.remove_stopwords()
        # Build the stopword collection once, outside the loops, as a set:
        # membership tests are O(1) instead of O(n) per word.
        stop_words = set(nltk.corpus.stopwords.words('english'))
        for _, row in reader.dataset.iterrows():
            for word in row['email'].split(' '):
                # assertNotIn reports the offending word on failure, unlike
                # assertEqual(<bool>, True) which only prints False != True.
                self.assertNotIn(word.lower(), stop_words)
# Ejemplo n.º 2
# 0
# my_dataset.to_lower()
# my_dataset.remove_digits()
# my_dataset.remove_punctuation_marks()
# my_dataset.remove_duplicates()
# my_dataset.remove_stopwords()
# my_dataset.remove_stopwords()

# print(my_dataset.dataset)

# Load the raw e-mail CSV and run the full text-cleaning pipeline.
emails_ds = DSReader('C:/Users/Masquerade/Downloads/emails.csv')

emails_ds.to_lower()
emails_ds.remove_digits()
emails_ds.remove_punctuation_marks()
emails_ds.remove_duplicates()
# NOTE(review): remove_stopwords() is called twice in the original script;
# both calls are kept to preserve behavior — confirm whether one suffices.
emails_ds.remove_stopwords()
emails_ds.remove_stopwords()

# print(emails_ds.dataset)

# Vectorize the cleaned dataset and report the resulting matrix shapes.
email_vectors, label_vectors = emails_ds.vectorize()
print(email_vectors.shape)
print(label_vectors.shape)

X, y = email_vectors, label_vectors
# X, y = emails_ds.dataset.email, emails_ds.dataset.label

# Split into train/test partitions on the underlying value arrays.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values)

separator = "______________________________________________"
print(separator)
print(y_test)
print(separator)