# Load the texts and their labels for the requested categories, then hold
# out part of them for evaluation (train_size=0.7 is passed to the splitter).
documents, classes = get_texts(categories)
train_docs, test_docs, train_classes, test_classes = train_test_split(
    documents, classes, train_size=0.7
)

# Build the classifier with lowercasing enabled and the English stop-word
# list supplied as stop_words.
classifier = NaiveBayesTextClassifier(
    categories=categories,
    min_df=1,
    lowercase=True,
    stop_words=stopwords.words('english'),
)

print('> Train classifier')
classifier.train(train_docs, train_classes)

print('> Classify test data...')
predicted_classes = classifier.classify(test_docs)
print('> Complete.')

# Report per-class metrics followed by the summary figures.
print(classification_report(test_classes, predicted_classes))
print('-' * 42)
print("{:<25}: {:>4} articles".format("Test data size", len(test_classes)))
print("{:<25}: {:>6.2f} %".format(
    "Accuracy",
    100 * accuracy_score(test_classes, predicted_classes),
))
# Both label sequences are passed through category_to_number before being
# handed to kappa.
print("{:<25}: {:>6.2f} %".format(
    "Kappa statistics",
    100 * kappa(
        category_to_number(test_classes, categories),
        category_to_number(predicted_classes, categories),
    ),
))
# Summary of the training phase: document count, vocabulary size (second
# dimension of classifier.bag) and time elapsed since start_time.
print("{:<25}: {:>6} articles".format("Total", total_docs))
print("{:<25}: {:>6} words".format(
    "Number of words", classifier.bag.shape[1]
))
print("{:<25}: {:>6.2f} seconds".format(
    "Parse time", time.time() - start_time
))
print("-" * 42)

# -------------- Classify --------------- #

print("> Start classify data")
start_time = time.time()  # restart the timer for the classification phase

if options.test:
    # Evaluation mode: classify the held-out documents and print metrics.
    predicted_classes = classifier.classify(test_docs)

    print(classification_report(test_classes, predicted_classes))
    print('-' * 42)
    print("{:<25}: {:>6} articles".format(
        "Test data size", len(test_classes)
    ))
    print("{:<25}: {:>6.2f} %".format(
        "Accuracy",
        100 * accuracy_score(test_classes, predicted_classes),
    ))
    print("{:<25}: {:>6.2f} %".format(
        "Kappa statistics",
        100 * kappa(test_classes, predicted_classes),
    ))
elif options.predict:
    # Prediction mode: classify the `review` field of test_data.
    predicted_classes = classifier.classify(test_data.review)

    print("> Save predicted results")
# Split the 'tweet' and 'label' columns into train and test parts.
train_docs, test_docs, train_classes, test_classes = (
    non_shuffling_train_test_split(train_data['tweet'], train_data['label'])
)

# Replace missing values with the placeholder string '1' before training.
train_docs = train_docs.fillna('1')
train_classes = train_classes.fillna('1')

# Diagnostic output: whether any nulls remain, and the container type.
print(train_docs.isnull().any())
print(train_classes.isnull().any())
print(type(train_docs))

print("> Train classifier")
classifier.train(train_docs, train_classes)

total_docs = len(train_docs)

print("-" * 42)
print("Total", total_docs, " tweets")
print("Number of words", classifier.bag.shape[1], " words")
print("Parse time", time.time() - start_time, "seconds")
print("-" * 42)

# -------------- Classify --------------- #

print("> Start classify data")
start_time = time.time()  # restart the timer for the classification phase

# Apply the same missing-value placeholder to the test split.
test_docs = test_docs.fillna('1')
test_classes = test_classes.fillna('1')

predicted_classes = classifier.classify(test_docs)
print((predicted_classes), (test_classes))

print(classification_report(test_classes, predicted_classes))
print('-' * 42)
print("Test data size", len(test_classes), "articles")
print("Accuracy", 100 * accuracy_score(test_classes, predicted_classes), "%")

end_time = time.time()
print("Computation time", end_time - start_time, "seconds")
print('-' * 42)
total_docs = len(train_data) print("-" * 42) print("{:<25}: {:>6} articles".format("Total", total_docs)) print("{:<25}: {:>6} words".format("Number of words", classifier.bag.shape[1])) print("{:<25}: {:>6.2f} seconds".format("Parse time", time.time() - start_time)) print("-" * 42) # -------------- Classify --------------- # print("> Start classify data") start_time = time.time() if options.test: predicted_classes = classifier.classify(test_docs) print(classification_report(test_classes, predicted_classes)) print('-' * 42) print("{:<25}: {:>6} articles".format("Test data size", len(test_classes))) print("{:<25}: {:>6.2f} %".format( "Accuracy", 100 * accuracy_score(test_classes, predicted_classes))) print("{:<25}: {:>6.2f} %".format( "Kappa statistics", 100 * kappa(test_classes, predicted_classes))) elif options.predict: predicted_classes = classifier.classify(test_data.review) print("> Save predicted results") print("> {}".format(PREDICTED_DATA_FILE)) np.savetxt(PREDICTED_DATA_FILE,