    # Tail of the document-loading helper defined above this excerpt.
    return documents, classes


def category_to_number(classes, category_type):
    return list(map(category_type.index, classes))


print('> Read files...')
categories = os.listdir(data_dir)

print('> Split data to test and train')
documents, classes = get_texts(categories)
train_docs, test_docs, train_classes, test_classes = train_test_split(
    documents, classes, train_size=0.7)

classifier = NaiveBayesTextClassifier(
    categories=categories,
    min_df=1,
    lowercase=True,
    stop_words=stopwords.words('english')
)

print('> Train classifier')
classifier.train(train_docs, train_classes)

print('> Classify test data...')
predicted_classes = classifier.classify(test_docs)

print('> Complete.')
print(classification_report(test_classes, predicted_classes))
print('-' * 42)
print("{:<25}: {:>4} articles".format("Test data size", len(test_classes)))
# The excerpt is cut off here; the unfinished call most likely reports
# accuracy as a percentage, for example:
print("{:<25}: {:>6.2f} %".format(
    "Accuracy", 100 * accuracy_score(test_classes, predicted_classes)))
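The excerpt opens with the closing return of a document-loading helper, presumably the get_texts called a few lines later, whose body sits above the cut. A minimal sketch of such a helper, assuming one sub-directory per category under data_dir with plain-text files inside (this reconstruction is a guess, not the original code):

import os

def get_texts(categories, data_dir='data'):
    """Hypothetical reconstruction: collect each document's text and its label."""
    documents, classes = [], []
    for category in categories:
        category_dir = os.path.join(data_dir, category)
        for filename in os.listdir(category_dir):
            with open(os.path.join(category_dir, filename), errors='ignore') as f:
                documents.append(f.read())
            classes.append(category)
    return documents, classes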
important_files = (TEST_DATA_FILE, LABELED_TRAIN_DATA_FILE)
for tsv_file in important_files:
    if not os.path.exists(tsv_file):
        raise EnvironmentError("File {} doesn't exist at {}.".format(
            ntpath.basename(tsv_file), DATADIR
        ))

print("> Read train data")
train_data = read_csv(LABELED_TRAIN_DATA_FILE, sep='\t')

print("> Init classifier")
start_time = time.time()

classifier = NaiveBayesTextClassifier(
    categories=[0, 1],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english')
)

if options.test:
    print("> Split data to test and train")
    train_docs, test_docs, train_classes, test_classes = train_test_split(
        train_data.review, train_data.sentiment, train_size=0.7
    )

    print("> Train classifier")
    classifier.train(train_docs, train_classes)
    total_docs = len(train_docs)

elif options.predict:
    print("> Read test data")
    test_data = read_csv(TEST_DATA_FILE, sep='\t')
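The branch on options.test and options.predict relies on an options object that the excerpt never builds. One minimal way to expose those two modes on the command line is sketched below; the flag names and help strings are assumptions, not the original script's interface:

import argparse

parser = argparse.ArgumentParser(description="Naive Bayes IMDB review classifier")
parser.add_argument('--test', action='store_true',
                    help="hold out part of the labelled reviews and report accuracy")
parser.add_argument('--predict', action='store_true',
                    help="train on all labelled reviews and classify the test TSV")
options = parser.parse_args()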
def non_shuffling_train_test_split(X, y, test_size=0.2):
    i = int((1 - test_size) * X.shape[0]) + 1
    X_train, X_test = np.split(X, [i])
    y_train, y_test = np.split(y, [i])
    return X_train, X_test, y_train, y_test


print("> Read train data")
train_data = read_csv('train.csv')

print("> Init classifier")
start_time = time.time()

classifier = NaiveBayesTextClassifier(
    categories=['1', '0', 't'],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english'))

print("> Split data to test and train")
train_docs, test_docs, train_classes, test_classes = non_shuffling_train_test_split(
    train_data['tweet'], train_data['label'])

train_docs = train_docs.fillna('1')
train_classes = train_classes.fillna('1')
print(train_docs.isnull().any())
print(train_classes.isnull().any())
print(type(train_docs))

print("> Train classifier")
classifier.train(train_docs, train_classes)
total_docs = len(train_docs)
print("-" * 42)
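Unlike scikit-learn's train_test_split, the helper above never shuffles: it cuts the data just past the (1 - test_size) mark and keeps the rows in their original order. A small illustration on made-up data (the tweets and labels here are placeholders, not the competition data):

import numpy as np
import pandas as pd

frame = pd.DataFrame({
    'tweet': ['t1', 't2', 't3', 't4', 't5', 't6'],
    'label': ['0', '1', '0', '1', '0', '1'],
})

X_train, X_test, y_train, y_test = non_shuffling_train_test_split(
    frame['tweet'], frame['label'], test_size=0.2)

print(list(X_train))  # ['t1', 't2', 't3', 't4', 't5'] - leading rows, order kept
print(list(X_test))   # ['t6'] - the tail becomes the test set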
"and put it in {}.".format(DATADIR)) important_files = (TEST_DATA_FILE, LABELED_TRAIN_DATA_FILE) for tsv_file in important_files: if not os.path.exists(tsv_file): raise EnvironmentError("File {} doesn't exist at {}.".format( ntpath.basename(tsv_file), DATADIR)) print("> Read train data") train_data = read_csv(LABELED_TRAIN_DATA_FILE, sep='\t') print("> Init classifier") start_time = time.time() classifier = NaiveBayesTextClassifier( categories=[0, 1], min_df=1, lowercase=True, # 127 English stop words stop_words=stopwords.words('english')) if options.test: print("> Split data to test and train") train_docs, test_docs, train_classes, test_classes = train_test_split( train_data.review, train_data.sentiment, train_size=0.7) print("> Train classifier") classifier.train(train_docs, train_classes) total_docs = len(train_docs) elif options.predict: print("> Read test data") test_data = read_csv(TEST_DATA_FILE, sep='\t')