# --- Newsgroup-style corpus: one subdirectory of data_dir per category. ---
print('> Read files...')
categories = os.listdir(data_dir)  # each subdirectory name is a class label
print('> Split data to test and train')
documents, classes = get_texts(categories)
# 70/30 train/test split over the whole corpus.
train_docs, test_docs, train_classes, test_classes = train_test_split(
    documents, classes, train_size=0.7)
classifier = NaiveBayesTextClassifier(
    categories=categories,
    min_df=1,
    lowercase=True,
    stop_words=stopwords.words('english')
)
print('> Train classifier')
classifier.train(train_docs, train_classes)
print('> Classify test data...')
predicted_classes = classifier.classify(test_docs)
print('> Complete.')
# Per-class precision/recall/F1 report, then aggregate metrics below.
print(classification_report(test_classes, predicted_classes))
print('-' * 42)
print("{:<25}: {:>4} articles".format("Test data size", len(test_classes)))
print("{:<25}: {:>6.2f} %".format(
    "Accuracy", 100 * accuracy_score(test_classes, predicted_classes))
)
# NOTE(review): the kappa(...) call below is truncated in this chunk — its
# remaining arguments and closing parentheses are outside the visible source.
print("{:<25}: {:>6.2f} %".format(
    "Kappa statistics", 100 * kappa(
        category_to_number(test_classes, categories),
# NOTE(review): this chunk begins mid-call — the opening of this constructor
# (presumably `classifier = NaiveBayesTextClassifier(`) is outside the
# visible source.
    categories=['1', '0', 't'],  # tweet labels; '1'/'0' look like sentiment flags — TODO confirm what 't' means
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english'))
print("> Split data to test and train")
# Non-shuffling split keeps the original tweet order in both partitions.
train_docs, test_docs, train_classes, test_classes = non_shuffling_train_test_split(
    train_data['tweet'], train_data['label'])
# Replace missing tweets/labels with the literal string '1'.
train_docs = train_docs.fillna('1')
train_classes = train_classes.fillna('1')
print(train_docs.isnull().any())   # sanity check — expected False after fillna
print(train_classes.isnull().any())
print(type(train_docs))
print("> Train classifier")
classifier.train(train_docs, train_classes)
total_docs = len(train_docs)
print("-" * 42)
print("Total", total_docs, " tweets")
print("Number of words", classifier.bag.shape[1], " words")
print("Parse time", time.time() - start_time, "seconds")
print("-" * 42)
# -------------- Classify --------------- #
print("> Start classify data")
start_time = time.time()  # restart the timer for the classification phase
test_docs = test_docs.fillna('1')
test_classes = test_classes.fillna('1')
predicted_classes = classifier.classify(test_docs)
print((predicted_classes), (test_classes))
for category in categories: category_files_path = os.path.join(data_dir, category) text_ids = os.listdir(category_files_path) prepare_category_file = functools.partial(prepare_file, category_files_path) texts = [prepare_category_file(f) for f in text_ids] documents += texts classes += [category] * len(texts) return documents, classes print('Get Gategories...') categories = os.listdir(data_dir) print('Reading Data...') documents, classes = get_texts(categories) train_docs, test_docs, train_classes, test_classes = train_test_split(documents, classes, train_size=0.9) clf = NaiveBayesTextClassifier(categories=categories, min_df=1, lowercase=True, stop_words=stopwords.words('english')) print('Training...') clf.train(train_docs, train_classes) print('Predicting...') predicted_classes = clf.classify(test_docs) print('Result:') print('-' * 72) print(classification_report(test_classes, predicted_classes)) print('-' * 72)
# Binary (0/1) sentiment model over the labelled review corpus.
classifier = NaiveBayesTextClassifier(
    categories=[0, 1],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english'))

if options.test:
    # Evaluation mode: hold out 30% of the labelled reviews as a test set.
    print("> Split data to test and train")
    train_docs, test_docs, train_classes, test_classes = train_test_split(
        train_data.review, train_data.sentiment, train_size=0.7)
    print("> Train classifier")
    classifier.train(train_docs, train_classes)
    total_docs = len(train_docs)
elif options.predict:
    # Prediction mode: train on every labelled review and load the
    # unlabelled test file for later classification.
    print("> Read test data")
    test_data = read_csv(TEST_DATA_FILE, sep='\t')
    print("> Train classifier")
    classifier.train(train_data.review, train_data.sentiment)
    total_docs = len(train_data)

# Summary: number of training documents and vocabulary size.
print("-" * 42)
print("{:<25}: {:>6} articles".format("Total", total_docs))
print("{:<25}: {:>6} words".format(
    "Number of words", classifier.bag.shape[1]
))
# --- Train a binary (0/1) sentiment classifier over movie reviews. ---
print("> Init classifier")
start_time = time.time()  # used at the end to report total parse/train time
classifier = NaiveBayesTextClassifier(
    categories=[0, 1],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english'))
if options.test:
    # Evaluation mode: hold out 30% of the labelled data as a test set.
    print("> Split data to test and train")
    train_docs, test_docs, train_classes, test_classes = train_test_split(
        train_data.review, train_data.sentiment, train_size=0.7)
    print("> Train classifier")
    classifier.train(train_docs, train_classes)
    total_docs = len(train_docs)
elif options.predict:
    # Prediction mode: train on all labelled data and read the separate
    # unlabelled test file (tab-separated).
    print("> Read test data")
    test_data = read_csv(TEST_DATA_FILE, sep='\t')
    print("> Train classifier")
    classifier.train(train_data.review, train_data.sentiment)
    total_docs = len(train_data)
# Summary: corpus size, vocabulary size, and elapsed time.
print("-" * 42)
print("{:<25}: {:>6} articles".format("Total", total_docs))
print("{:<25}: {:>6} words".format("Number of words", classifier.bag.shape[1]))
print("{:<25}: {:>6.2f} seconds".format("Parse time", time.time() - start_time))