Beispiel #1
0
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    """Average LabelSpreading (kNN kernel) scores over ten runs on a
    bag-of-words representation built from the runtime vocabulary.

    Appends two summary strings (macro-F1 and accuracy) to *str_list*
    and prints both averages.

    Args:
        nbr: split size forwarded to ``dataset.split_train_true``.
        str_list: list that receives the two summary strings (mutated in place).
        neighbors: ``n_neighbors`` for the kNN kernel.
    """
    runs = 10
    avg_f1 = 0.0
    avg_accuracy = 0.0
    for _ in range(runs):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= runs
    avg_f1 /= runs
    # Bug fix: accuracy label previously read "runtime vod" (typo) while the
    # f1 label read "runtime voc"; both now use the same tag.
    str_list.extend(["KNN BOW runtime voc Avg f1: " + str(avg_f1),
                     "KNN BOW runtime voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Beispiel #2
0
def run_naive_bayes_bow_runtime_vocabulary(nbr, str_list):
    """Average MultinomialNB scores over ten runs on a bag-of-words
    representation built from the runtime vocabulary.

    Appends two summary strings (macro-F1 and accuracy) to *str_list*
    and prints both averages.

    Args:
        nbr: split size forwarded to ``dataset.split_train_bayers``.
        str_list: list that receives the two summary strings (mutated in place).
    """
    f1_total = 0.0
    acc_total = 0.0
    for _run in range(10):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_bayers(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        bow = CountVectorizer(vocabulary=Vocabulary.get_vocabulary(categories))
        train_matrix = bow.fit_transform(dataset.train['data']).todense()

        model = MultinomialNB().fit(train_matrix, dataset.train['target'])
        test_matrix = bow.transform(dataset.test['data']).todense()
        predictions = model.predict(test_matrix)
        f1_total += metrics.f1_score(dataset.test['target'], predictions,
                                     average='macro')
        acc_total += model.score(test_matrix, dataset.test['target'])
    mean_f1 = f1_total / 10
    mean_acc = acc_total / 10
    str_list.extend([
        "NB BOW runtime voc Avg f1: " + str(mean_f1),
        "NB BOW runtime voc Avg acc: " + str(mean_acc),
    ])
    print("Avg f1: " + str(mean_f1))
    print("Avg acc: " + str(mean_acc))
Beispiel #3
0
def run_lp_bow_vocabulary(nbr, str_list, gamma):
    """Average LabelPropagation (RBF kernel) scores over ten runs on a
    bag-of-words representation restricted to the fixed vocabulary.

    Appends two summary strings (macro-F1 and accuracy) to *str_list*
    and prints both averages.

    Args:
        nbr: split size forwarded to ``dataset.split_train_true``.
        str_list: list that receives the two summary strings (mutated in place).
        gamma: RBF kernel coefficient.
    """
    f1_sum, acc_sum = 0.0, 0.0
    for _ in range(10):
        data = Dataset(categories)
        data.split_train_true(nbr)
        bow = CountVectorizer(vocabulary=Vocabulary.get_vocabulary(categories))
        train_dense = bow.fit_transform(data.train['data']).todense()

        model = LabelPropagation(kernel='rbf', gamma=gamma).fit(
            train_dense, data.train['target'])
        test_dense = bow.transform(data.test['data']).todense()
        predicted = model.predict(test_dense)
        f1_sum += metrics.f1_score(data.test['target'], predicted,
                                   average='macro')
        acc_sum += model.score(test_dense, data.test['target'])
    mean_f1 = f1_sum / 10
    mean_acc = acc_sum / 10
    str_list.extend([
        "RBF BOW voc Avg f1: " + str(mean_f1),
        "RBF BOW voc Avg acc: " + str(mean_acc),
    ])
    print("Avg f1: " + str(mean_f1))
    print("Avg acc: " + str(mean_acc))
Beispiel #4
0
def run_naive_bayes_tfidf_vocabulary(nbr, str_list):
    """Average MultinomialNB scores over ten runs on a TF-IDF
    representation restricted to the fixed vocabulary.

    Appends two summary strings (macro-F1 and accuracy) to *str_list*
    and prints both averages.

    Args:
        nbr: split size forwarded to ``dataset.split_train_bayers``.
        str_list: list that receives the two summary strings (mutated in place).
    """
    f1_sum = 0.0
    acc_sum = 0.0
    for _trial in range(10):
        data = Dataset(categories)
        data.split_train_bayers(nbr)
        tfidf = TfidfVectorizer(vocabulary=Vocabulary.get_vocabulary(categories))
        train_dense = tfidf.fit_transform(data.train['data']).todense()

        model = MultinomialNB().fit(train_dense, data.train['target'])
        test_dense = tfidf.transform(data.test['data']).todense()
        predicted = model.predict(test_dense)
        f1_sum += metrics.f1_score(data.test['target'], predicted,
                                   average='macro')
        acc_sum += model.score(test_dense, data.test['target'])
    mean_f1 = f1_sum / 10
    mean_acc = acc_sum / 10
    str_list.extend([
        "NB TF-IDF voc Avg f1: " + str(mean_f1),
        "NB TF-IDF voc Avg acc: " + str(mean_acc),
    ])
    print("Avg f1: " + str(mean_f1))
    print("Avg acc: " + str(mean_acc))
Beispiel #5
0
def print_v2_test_docs_vocabulary(categories):
    """Filter the raw per-category test documents against the vocabulary and
    write the surviving documents to ../assets/20newsgroups/test2vocabulary/.

    A cleaned line is kept when it is longer than 4 characters and contains
    at least one vocabulary word; every other line counts as removed and is
    reported at the end.

    Args:
        categories: category names used to build input/output file names.
    """
    removed_test = 0
    print("Printing docs...")
    for category in categories:
        in_path = ('../assets/20newsgroups/test/newsgroups_test_' +
                   category + '.txt')
        out_path = ('../assets/20newsgroups/test2vocabulary/newsgroups_test_' +
                    category + '.txt')
        # Bug fix: the original opened the input file inside a list
        # comprehension and never closed it; use a with-block instead.
        with open(in_path) as src:
            lines = [line.rstrip('\n') for line in src]
        dataset = Dataset(categories)
        vectorizer = CountVectorizer(
            vocabulary=voc.get_vocabulary(categories))
        vectorizer.fit_transform(dataset.train['data'])
        vocabulary = vectorizer.vocabulary_
        with open(out_path, 'w') as f:
            for raw in lines:
                cleaned = re.sub(r'[^\w]', " ", raw)
                cleaned = re.sub(r'\b[a-zA-Z]\b', " ", cleaned)
                # Remove extra spaces or tabs.
                cleaned = re.sub(r'[ \t]+', " ", cleaned)
                cleaned = cleaned.strip() + "\n"
                # Keep the doc only if at least one of its words is in
                # the fitted vocabulary.
                keep = any(word in vocabulary for word in cleaned.split())
                if len(cleaned) > 4 and keep:
                    f.write(cleaned)
                else:
                    removed_test += 1
    print("Printing finished")
    print("Removed testing doc:", removed_test)