Beispiel #1
0
    def classify_docs(self, classifier, knn_k=5):
        """Predict a class for every document in ``self.document_words``.

        A TF-IDF model and the requested classifier are built from the
        English training and test CSV files named in ``Config``. Each
        document's words are then joined with single spaces and handed to
        the classifier's ``predict`` method.

        Args:
            classifier: Name of the classifier to use: ``'KNN'``,
                ``'Naive Bayes'`` or ``'SVM'``. Any other value selects
                ``RandomForestClassifier``.
            knn_k: Second argument passed to ``predict`` when ``classifier``
                is ``'KNN'`` (presumably the number of neighbours -- confirm
                against ``KNNClassifier.predict``). Ignored otherwise.

        Returns:
            An integer NumPy array of shape ``(self.n_documents,)``. Entry
            ``i`` holds the value ``predict`` returned for the ``i``-th
            document; entries for which no document was iterated stay 0.
        """
        english_text_preprocessor = EnglishTextPreProcessor()
        training_data = pnd.read_csv(Config.ENGLISH_TRAINING_DATA_DIR)
        testing_data = pnd.read_csv(Config.ENGLISH_TEST_DATA_DIR)
        tfidf = TfIdf("English", training_data, english_text_preprocessor)

        # Only the KNN classifier takes the extra argument in predict(), so
        # decide once instead of comparing strings for every document.
        use_knn = classifier == 'KNN'
        if use_knn:
            clf = KNNClassifier(training_data, testing_data, tfidf)
        elif classifier == 'Naive Bayes':
            clf = NaiveBayesClassifier(training_data, testing_data, tfidf)
        elif classifier == 'SVM':
            clf = SVMClassifier(training_data, testing_data, tfidf)
        else:
            # Unrecognised names deliberately fall back to the random forest.
            clf = RandomForestClassifier(training_data, testing_data, tfidf)

        predicted_classes = np.zeros(shape=(self.n_documents, ), dtype=int)
        for i, doc_words in enumerate(
                tqdm(self.document_words, position=0, leave=True)):
            # Space-separated document text; an empty word list gives "".
            doc_text = " ".join(doc_words)
            if use_knn:
                predicted_classes[i] = clf.predict(doc_text, knn_k)
            else:
                predicted_classes[i] = clf.predict(doc_text)

        return predicted_classes
Beispiel #2
0
        np.set_printoptions(precision=3)
        logger.info("On " + set_name + " Data")
        logger.info("K = {}:\n"
                    "\tAccuracy = {:.3f}\n"
                    "\tRecall_per class = {}\n"
                    "\tPrecision_per class = {}\n"
                    "\tmicro_F1 = {:.3f}\n"
                    "\tmacro_F1 = {:.3f}".format(k, accuracy, per_class_recall, per_class_precision,
                                                 micro_f1, macro_f1))
        return micro_f1

    def evaluate_validation_for_ks(self, ks):
        """Return the entry of ``ks`` that scores best on the validation set.

        ``self.evaluate`` is called once per candidate, in order, with the
        validation matrix, the validation labels, the candidate and
        ``set_name="Validation"``. The candidate whose returned score is
        largest is returned; on ties the earliest one wins, because
        ``np.argmax`` reports the first maximal position.
        """
        validation_scores = [
            self.evaluate(self.validation_docs_matrix, self.validation_labels,
                          candidate, set_name="Validation")
            for candidate in ks
        ]
        best_position = np.argmax(validation_scores)
        return ks[best_position]



if __name__ == '__main__':
    # Script entry point: load the English training and test CSV files named
    # in Config, build the TF-IDF model from the training data, and construct
    # a KNN classifier on top of it.
    english_text_preprocessor = EnglishTextPreProcessor()
    training_data = pnd.read_csv(Config.ENGLISH_TRAINING_DATA_DIR)
    testing_data = pnd.read_csv(Config.ENGLISH_TEST_DATA_DIR)

    tfidf = TfIdf("English", training_data, english_text_preprocessor)

    knn_clf = KNNClassifier(training_data, testing_data, tfidf)
    # Disabled manual check: predict for the one-word document "Italy" with 5
    # as the second argument, then print the result.
    # a = knn_clf.predict("Italy", 5)
    # print(a)