Exemple #1
0
def remove_frequent_and_infrequent_words(newsgroup):
    """Strip too-frequent and too-infrequent words from every document in
    ``newsgroup['data']``, in place.

    The kept vocabulary is what survives ``CountVectorizer(max_df=0.5,
    min_df=10)``; every other word of the full vocabulary is deleted from
    the raw text of each document.

    Args:
        newsgroup: mapping with a ``'data'`` key holding a mutable list of
            document strings (modified in place).
    """
    # Vocabulary after dropping words in more than 50% of the documents
    # or in fewer than 10 documents.
    vectorizer = CountVectorizer(max_df=0.5, min_df=10)
    vectorizer.fit_transform(newsgroup['data'])
    vocabulary = voc.get_top_n_words(newsgroup['data'],
                                     len(vectorizer.vocabulary_))
    # Full, unfiltered vocabulary.
    vectorizer = CountVectorizer()
    vectorizer.fit_transform(newsgroup['data'])
    vocabulary_with_freq_and_infreq = voc.get_top_n_words(
        newsgroup['data'], len(vectorizer.vocabulary_))
    # get_top_n_words returns (word, count) pairs; keep only the words.
    # Use a set for the kept words so membership tests are O(1) instead of
    # the original's O(n) list scan per word (quadratic overall).
    kept = {pair[0] for pair in vocabulary}
    all_words = [pair[0] for pair in vocabulary_with_freq_and_infreq]
    print(len(kept))
    print(len(all_words))
    # Words in the unfiltered vocabulary but not in the filtered one.
    # re.escape prevents regex metacharacters in a word from corrupting
    # the alternation pattern (the original interpolated words verbatim).
    remove = [re.escape(word) for word in all_words if word not in kept]
    if not remove:
        # Nothing to strip; without this guard the pattern would be
        # r'\b()\s' and match bare whitespace.
        return
    pattern = re.compile(r'\b(' + "|".join(remove) + r')\s')
    total = len(newsgroup['data'])
    for i, doc in enumerate(newsgroup['data']):
        newsgroup['data'][i] = pattern.sub(' ', doc)
        # Original printed the 1-based count after incrementing.
        print("Document: ", i + 1, "/", total)
    """
Exemple #2
0
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    """Average 10 LabelSpreading (knn kernel) runs on BOW features built
    from the runtime vocabulary; append the summary lines to ``str_list``.

    Args:
        nbr: labeled-split size passed to ``split_train_true`` (exact
            semantics defined by Dataset — confirm there).
        str_list: list receiving the two "Avg f1" / "Avg acc" strings.
        neighbors: ``n_neighbors`` for the knn kernel.
    """
    runs = 10
    avg_f1 = 0.0
    avg_accuracy = 0.0
    for _ in range(runs):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= runs
    avg_f1 /= runs
    # "vod" in the original accuracy label was a typo for "voc"
    # (the sibling NB/RBF functions all use "voc").
    str_list.extend(["KNN BOW runtime voc Avg f1: " + str(avg_f1),
                     "KNN BOW runtime voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Exemple #3
0
def run_naive_bayes_bow_runtime_vocabulary(nbr, str_list):
    """Average 10 MultinomialNB runs on BOW features built from the
    runtime vocabulary; append the summary lines to ``str_list``.

    Args:
        nbr: split size passed to ``split_train_bayers`` (semantics
            defined by Dataset — confirm there).
        str_list: list receiving the two "Avg f1" / "Avg acc" strings.
    """
    runs = 10
    avg_f1 = 0.0
    avg_accuracy = 0.0
    for _ in range(runs):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_bayers(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= runs
    avg_f1 /= runs
    str_list.extend([
        "NB BOW runtime voc Avg f1: " + str(avg_f1),
        "NB BOW runtime voc Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Exemple #4
0
def run_naive_bayes_tfidf_vocabulary(nbr, str_list):
    """Average 10 MultinomialNB runs on TF-IDF features restricted to the
    fixed vocabulary; append the summary lines to ``str_list``.

    Args:
        nbr: split size passed to ``split_train_bayers`` (semantics
            defined by Dataset — confirm there).
        str_list: list receiving the two "Avg f1" / "Avg acc" strings.
    """
    runs = 10
    avg_f1 = 0.0
    avg_accuracy = 0.0
    for _ in range(runs):
        dataset = Dataset(categories)
        dataset.split_train_bayers(nbr)
        vectorizer = TfidfVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= runs
    avg_f1 /= runs
    str_list.extend([
        "NB TF-IDF voc Avg f1: " + str(avg_f1),
        "NB TF-IDF voc Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Exemple #5
0
def run_lp_bow_vocabulary(nbr, str_list, gamma):
    """Average 10 LabelPropagation (rbf kernel) runs on BOW features
    restricted to the fixed vocabulary; append the summary lines to
    ``str_list``.

    Args:
        nbr: labeled-split size passed to ``split_train_true`` (semantics
            defined by Dataset — confirm there).
        str_list: list receiving the two "Avg f1" / "Avg acc" strings.
        gamma: rbf kernel bandwidth.
    """
    runs = 10
    avg_f1 = 0.0
    avg_accuracy = 0.0
    for _ in range(runs):
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelPropagation(kernel='rbf', gamma=gamma).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= runs
    avg_f1 /= runs
    str_list.extend([
        "RBF BOW voc Avg f1: " + str(avg_f1),
        "RBF BOW voc Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Exemple #6
0
def print_v2_test_docs_vocabulary_labeled(categories):
    """Clean every category's test documents and write to the
    ``test2vocabulary_labeled`` directory only the documents that contain
    at least one word of the labeled vocabulary.

    A cleaned document is dropped when it is too short (length <= 4,
    newline included) or shares no word with the vocabulary; the number of
    dropped documents is printed at the end.

    Args:
        categories: iterable of category names used to locate the
            per-category test files.
    """
    removed_test = 0
    print("Printing docs...")
    # The vocabulary does not depend on the current category, so build it
    # once instead of once per iteration as the original did.
    dataset = Dataset(categories)
    vectorizer = CountVectorizer(
        vocabulary=voc.get_vocabulary_only_labeled(categories))
    vectorizer.fit_transform(dataset.train['data'])
    vocabulary = vectorizer.vocabulary_
    # Pre-compile the cleaning patterns applied to every line.
    non_word = re.compile(r'[^\w]')
    single_letter = re.compile(r'\b[a-zA-Z]\b')
    extra_space = re.compile(r'[ \t]+')
    for category in categories:
        src = ('../assets/20newsgroups/test/newsgroups_test_' +
               category + '.txt')
        dst = ('../assets/20newsgroups/test2vocabulary_labeled/newsgroups_test_'
               + category + '.txt')
        # The original opened the source file inside a comprehension and
        # never closed it; use a with-block so it is released promptly.
        with open(src) as src_file:
            lines = [line.rstrip('\n') for line in src_file]
        with open(dst, 'w') as f:
            for line in lines:
                line = non_word.sub(" ", line)
                line = single_letter.sub(" ", line)
                line = extra_space.sub(" ", line)  # remove extra space or tab
                line = line.strip() + "\n"
                # Keep the document only if some word is in the vocabulary.
                keep = any(word in vocabulary for word in line.split())
                if len(line) > 4 and keep:
                    f.write(line)
                else:
                    removed_test += 1
    print("Printing finished")
    print("Removed testing doc:", removed_test)
Exemple #7
0
# The five 20newsgroups categories used throughout these experiments.
categories = [
    'alt.atheism', 'comp.graphics', 'rec.autos', 'sci.space',
    'talk.politics.guns'
]

# initialize dataset
# Load the preprocessed corpus, apply the "bayers" split with 100
# (split-size semantics defined in Dataset — confirm there), then rebuild
# the filtered test set restricted to the labeled vocabulary.
dataset = Dataset(categories)
dataset.load_preprocessed(categories)
dataset.split_train_bayers(100)
print_v2_test_docs_vocabulary_labeled(categories)
dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)

# feature extraction
# TF-IDF features restricted to the vocabulary of labeled documents.
vectorizer = TfidfVectorizer(
    vocabulary=voc.get_vocabulary_only_labeled(categories))
vectors = vectorizer.fit_transform(dataset.train['data'])
print(len(vectorizer.vocabulary_))
# Train multinomial Naive Bayes on the training vectors and predict on
# the transformed test documents.
clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])
test_vec = vectorizer.transform(dataset.test['data'])
pred = clf.predict(test_vec.todense())

# Report macro-averaged F1 and mean accuracy on the test split.
print('f1 score Naive Bayes: ',
      metrics.f1_score(dataset.test['target'], pred, average='macro'))
print('clf score Naive Bayes: ',
      clf.score(test_vec.todense(), dataset.test['target']))

# Two-decimal numpy printing for the confusion-matrix output below.
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plot_confusion_matrix(
    dataset.test['target'],