import re

import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

# Project-local helpers (Dataset, voc, Vocabulary) are assumed to be
# imported from the surrounding project.


def remove_frequent_and_infrequent_words(newsgroup):
    # Filtered vocabulary: drop words appearing in more than half of the
    # documents (max_df=0.5) or in fewer than 10 documents (min_df=10).
    vectorizer = CountVectorizer(max_df=0.5, min_df=10)
    vectorizer.fit_transform(newsgroup['data'])
    vocabulary = voc.get_top_n_words(newsgroup['data'],
                                     len(vectorizer.vocabulary_))

    # Unfiltered vocabulary, for comparison.
    vectorizer = CountVectorizer()
    vectorizer.fit_transform(newsgroup['data'])
    vocabulary_with_freq_and_infreq = voc.get_top_n_words(
        newsgroup['data'], len(vectorizer.vocabulary_))

    # get_top_n_words returns (word, count) pairs; keep only the words.
    i = 0
    while i < len(vocabulary_with_freq_and_infreq):
        vocabulary_with_freq_and_infreq[i] = vocabulary_with_freq_and_infreq[i][0]
        if i < len(vocabulary):
            vocabulary[i] = vocabulary[i][0]
        i += 1
    print(len(vocabulary))
    print(len(vocabulary_with_freq_and_infreq))

    # Words present in the full vocabulary but absent from the filtered one
    # are the too-frequent/too-rare words; strip them from every document.
    remove = []
    i = 0
    while i < len(vocabulary_with_freq_and_infreq):
        if vocabulary_with_freq_and_infreq[i] not in vocabulary:
            remove.append(vocabulary_with_freq_and_infreq[i])
        i += 1
    remove = "|".join(remove)
    i = 0
    while i < len(newsgroup['data']):
        newsgroup['data'][i] = re.sub(r'\b(' + remove + r')\s', ' ',
                                      newsgroup['data'][i])
        i += 1
        print("Document: ", i, "/", len(newsgroup['data']))
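def _demo_df_filtering():
    # Illustrative sketch (toy corpus invented for this example): shows how
    # CountVectorizer's max_df/min_df bounds prune frequent and rare words,
    # which is the filtering step performed above.
    toy_corpus = ["the cat sat", "the dog sat", "the bird flew",
                  "one bird sang"]
    cv = CountVectorizer(max_df=0.5, min_df=2)
    cv.fit(toy_corpus)
    # 'the' exceeds max_df (3/4 docs); cat/dog/flew/one/sang fall below
    # min_df (1 doc each); only 'bird' and 'sat' (2/4 docs each) survive.
    print(sorted(cv.vocabulary_))  # expected: ['bird', 'sat']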
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    # Average LabelSpreading (knn kernel) over 10 runs on a bag-of-words
    # representation built from the runtime vocabulary.
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)

        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            vectors.todense(), dataset.train['target'])

        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy / 10
    avg_f1 = avg_f1 / 10
    str_list.extend(["KNN BOW runtime voc Avg f1: " + str(avg_f1),
                     "KNN BOW runtime voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
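def _demo_label_spreading():
    # Illustrative sketch (toy data invented for this example) of the
    # semi-supervised setup LabelSpreading expects: unlabeled samples carry
    # the label -1 and receive labels propagated from labeled neighbours.
    import numpy as np
    from sklearn.semi_supervised import LabelSpreading

    X = np.array([[0.0], [0.1], [0.2], [5.0], [5.1], [5.2]])
    y = np.array([0, -1, -1, 1, -1, -1])  # -1 marks unlabeled samples
    clf = LabelSpreading(kernel='knn', n_neighbors=2).fit(X, y)
    print(clf.transduction_)  # expected: [0 0 0 1 1 1]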
def run_naive_bayes_bow_runtime_vocabulary(nbr, str_list):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_bayers(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)

        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])

        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy / 10
    avg_f1 = avg_f1 / 10
    str_list.extend(["NB BOW runtime voc Avg f1: " + str(avg_f1),
                     "NB BOW runtime voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
def run_naive_bayes_tfidf_vocabulary(nbr, str_list):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.split_train_bayers(nbr)

        vectorizer = TfidfVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])

        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy / 10
    avg_f1 = avg_f1 / 10
    str_list.extend(["NB TF-IDF voc Avg f1: " + str(avg_f1),
                     "NB TF-IDF voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
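def _demo_fixed_vocabulary_tfidf():
    # Illustrative sketch (toy data invented for this example): passing a
    # fixed `vocabulary` restricts the vectorizer to exactly those terms,
    # so train and test matrices share columns even across refits, which is
    # what the vocabulary-based runs above rely on.
    from sklearn.feature_extraction.text import TfidfVectorizer

    vec = TfidfVectorizer(vocabulary=['cat', 'dog'])
    m = vec.fit_transform(["cat and dog", "dog only", "neither word"])
    print(m.shape)  # (3, 2): columns are exactly 'cat' and 'dog'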
def run_lp_bow_vocabulary(nbr, str_list, gamma):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)

        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        clf = LabelPropagation(kernel='rbf', gamma=gamma).fit(
            vectors.todense(), dataset.train['target'])

        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy / 10
    avg_f1 = avg_f1 / 10
    str_list.extend(["RBF BOW voc Avg f1: " + str(avg_f1),
                     "RBF BOW voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
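def _demo_label_propagation_rbf():
    # Illustrative sketch (toy data invented for this example): with the
    # rbf kernel, edge weights are exp(-gamma * ||x - y||^2), so gamma
    # controls how fast a labeled point's influence decays with distance.
    import numpy as np
    from sklearn.semi_supervised import LabelPropagation

    X = np.array([[0.0], [0.2], [0.4], [3.0], [3.2], [3.4]])
    y = np.array([0, -1, -1, 1, -1, -1])  # -1 marks unlabeled samples
    clf = LabelPropagation(kernel='rbf', gamma=5).fit(X, y)
    print(clf.transduction_)  # expected: [0 0 0 1 1 1]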
def print_v2_test_docs_vocabulary_labeled(categories):
    # Rewrite each category's test file, keeping only documents that share
    # at least one word with the labeled-data vocabulary.
    i = 0
    removed_test = 0
    print("Printing docs...")
    while i < len(categories):
        with open('../assets/20newsgroups/test2vocabulary_labeled/newsgroups_test_'
                  + categories[i] + '.txt', 'w') as f:
            with open('../assets/20newsgroups/test/newsgroups_test_'
                      + categories[i] + '.txt') as src:
                lines = [line.rstrip('\n') for line in src]
            j = 0
            dataset = Dataset(categories)
            vectorizer = CountVectorizer(
                vocabulary=voc.get_vocabulary_only_labeled(categories))
            vectorizer.fit_transform(dataset.train['data'])
            vocabulary = vectorizer.vocabulary_
            while j < len(lines):
                lines[j] = re.sub(r'[^\w]', " ", lines[j])  # drop punctuation
                lines[j] = re.sub(r'\b[a-zA-Z]\b', " ", lines[j])  # drop single letters
                lines[j] = re.sub(r'[ \t]+', " ", lines[j])  # remove extra space or tab
                lines[j] = lines[j].strip() + "\n"
                # Keep the document only if at least one of its words is
                # in the labeled vocabulary.
                remove_doc = True
                for word in lines[j].split():
                    if word in vocabulary:
                        remove_doc = False
                        break
                if len(lines[j]) > 4 and not remove_doc:
                    f.write(lines[j])
                else:
                    removed_test += 1
                j += 1
        i += 1
    print("Printing finished")
    print("Removed testing doc:", removed_test)
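# plot_confusion_matrix is not defined in this file and is presumably
# provided elsewhere in the project. As a stand-in, here is a minimal
# sketch in the spirit of the scikit-learn example-gallery helper of the
# same name, assuming a (y_true, y_pred, classes, normalize, title)
# signature:
def plot_confusion_matrix(y_true, y_pred, classes=None, normalize=False,
                          title=None):
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    fig.colorbar(im, ax=ax)
    if classes is not None:
        ax.set_xticks(np.arange(cm.shape[1]))
        ax.set_yticks(np.arange(cm.shape[0]))
        ax.set_xticklabels(classes, rotation=45, ha='right')
        ax.set_yticklabels(classes)
    ax.set_title(title or 'Confusion matrix')
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    # write the count (or rate) into each cell
    fmt = '.2f' if normalize else 'd'
    for r in range(cm.shape[0]):
        for c in range(cm.shape[1]):
            ax.text(c, r, format(cm[r, c], fmt), ha='center', va='center')
    fig.tight_layout()
    return ax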
categories = [
    'alt.atheism', 'comp.graphics', 'rec.autos', 'sci.space',
    'talk.politics.guns'
]

# initialize dataset
dataset = Dataset(categories)
dataset.load_preprocessed(categories)
dataset.split_train_bayers(100)
print_v2_test_docs_vocabulary_labeled(categories)
dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)

# feature extraction
vectorizer = TfidfVectorizer(
    vocabulary=voc.get_vocabulary_only_labeled(categories))
vectors = vectorizer.fit_transform(dataset.train['data'])
print(len(vectorizer.vocabulary_))

# train and evaluate Naive Bayes
clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])
test_vec = vectorizer.transform(dataset.test['data'])
pred = clf.predict(test_vec.todense())
print('f1 score Naive Bayes: ',
      metrics.f1_score(dataset.test['target'], pred, average='macro'))
print('clf score Naive Bayes: ',
      clf.score(test_vec.todense(), dataset.test['target']))

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plot_confusion_matrix(
    dataset.test['target'],
    pred,  # predicted labels; arguments after the first are assumed
    classes=categories)