def submit():
    """Classify the submitted text with every trained model for this session.

    Reads the form field ``text_input``, rejects inputs shorter than 10
    characters, re-applies the same preprocessing chain used at training
    time, vectorizes the text with the session's persisted TF-IDF
    vectorizer, and runs every pickled classifier found in the session's
    upload folder.

    Returns:
        Rendered ``results.html`` with ``{classifier_name: predicted_label}``,
        or a plain error string for too-short input.
    """
    text_input = request.form['text_input']
    # len() < 10 already rejects the empty string, so the original's separate
    # == "" check was redundant.
    if len(text_input) < 10:
        return "Please provide input large enough, Classifier can understand :)"

    # todo: change column name to be dynamically taken from training file
    test_data = pd.DataFrame([text_input], columns=['Document'])

    # Per-session artifacts live under UPLOAD_FOLDER/<session-id>/.
    session_id = request.cookies['session']
    path = os.path.join(app.config['UPLOAD_FOLDER'], session_id)
    # endswith() instead of substring test so e.g. 'x.pkl.bak' is not picked up.
    trained_classifier = [i for i in os.listdir(path) if i.endswith('.pkl')]

    # TF-IDF vectorizer fitted at training time for this session.
    vectorizer = os.path.join(path, 'tfidf_vectorizer.pk')
    tfidf_transformer = joblib.load(vectorizer)

    # Apply the identical preprocessing chain used at training time; each
    # step returns the updated DataFrame.
    pre_processor = PreProcess(test_data, column_name='Document')
    test_data = pre_processor.clean_html()
    test_data = pre_processor.remove_non_ascii()
    test_data = pre_processor.remove_spaces()
    test_data = pre_processor.remove_punctuation()
    test_data = pre_processor.stemming()
    test_data = pre_processor.lemmatization()
    test_data = pre_processor.stop_words()

    test_data1 = tfidf_transformer.transform(test_data.Document)

    result = {}
    for clf in trained_classifier:
        model = joblib.load(os.path.join(path, clf))
        # Predict once and reuse the value (the original invoked predict()
        # twice per classifier for the same input).
        prediction = model.predict(test_data1)[0]
        print(clf, prediction)
        classifier_name = clf.split('/')[-1].split('.')[0]
        result[classifier_name] = prediction
    print(result)
    return render_template('results.html', result=result)
def main():
    """Train, persist, and evaluate SVM and Multinomial Naive Bayes
    classifiers on the BBC document dataset.

    Side effects: writes ``vectorizer.pkl``, ``SVM.pkl`` and
    ``MultinomialNB.pkl`` to the current working directory and prints
    evaluation reports to stdout.
    """
    # todo: take the dataset location from configuration/CLI instead of a
    # hard-coded absolute path.
    data = get_data(
        '/Users/aditya1/Documents/Document_Classification/bbc-dataset')

    ###########################################################################
    # Data Pre-processing steps
    ###########################################################################
    column_name = data.columns[0]
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    # (Removed the unused local `pre_processor_operations` from the original.)
    data = pre_processor.clean_html()
    data = pre_processor.remove_non_ascii()
    data = pre_processor.remove_spaces()
    data = pre_processor.remove_punctuation()
    data = pre_processor.stemming()
    data = pre_processor.lemmatization()
    data = pre_processor.stop_words()

    ###########################################################################
    # Feature extraction
    ###########################################################################
    train_x, test_x, train_y, test_y = train_test_split(data.Document,
                                                        data.Category,
                                                        test_size=0.20)
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    # NOTE(review): the web app loads 'tfidf_vectorizer.pk' from a session
    # folder (see read_process_data), while this standalone script writes
    # 'vectorizer.pkl' to the CWD — confirm the two pipelines are
    # intentionally separate before unifying the names.
    joblib.dump(tfidf_transformer, 'vectorizer.pkl')
    test_vectors = tfidf_transformer.transform(test_x)
    print(data.head())

    ###########################################################################
    # Perform classification with SVM, kernel=linear
    model1 = svm.SVC(kernel='linear')
    model1.fit(train_vectors, train_y)
    joblib.dump(model1, 'SVM.pkl')
    y_pred_class = model1.predict(test_vectors)
    # The original printed the bare accuracy and then the same value again in
    # the formatted report below; the duplicate bare print was dropped.
    print("Prediction score for classifier %s:\n%s\n"
          % (model1, metrics.accuracy_score(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n"
          % (model1, metrics.classification_report(test_y, y_pred_class)))

    # Multinomial Naive Bayes on the same train/test split.
    model2 = MultinomialNB()
    model2.fit(train_vectors, train_y)
    joblib.dump(model2, 'MultinomialNB.pkl')
    y_pred_class = model2.predict(test_vectors)
    print("Accuracy score:", metrics.accuracy_score(test_y, y_pred_class))
    print("Confusion Matrix for classifier %s:\n%s\n"
          % (model2, metrics.confusion_matrix(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n"
          % (model2, metrics.classification_report(test_y, y_pred_class)))
def read_process_data(path, files_path):
    """Read a training CSV, clean its text, and fit a TF-IDF vectorizer.

    The fitted vectorizer is persisted to ``files_path/tfidf_vectorizer.pk``
    so the prediction endpoint can reload it later.

    Args:
        path: CSV file with the training documents; the first column is the
            text column handed to PreProcess.
        files_path: Directory in which to store the pickled vectorizer.

    Returns:
        Tuple of (TF-IDF matrix for the training split, matching labels).
    """
    frame = pd.read_csv(path)
    cleaner = PreProcess(frame, frame.columns[0])
    # todo: change code to provide all functions in class definition.
    # Run every cleaning step in order; each one returns the updated frame.
    for step in ('clean_html', 'remove_non_ascii', 'remove_spaces',
                 'remove_punctuation', 'stemming', 'lemmatization',
                 'stop_words'):
        frame = getattr(cleaner, step)()

    # Only the 80% training portion is used here; the held-out 20% is
    # discarded deliberately so the vectorizer never sees it.
    train_docs, _, train_labels, _ = train_test_split(frame.Document,
                                                      frame.Category,
                                                      test_size=0.20)

    vectorizer = TfidfVectorizer(min_df=1)
    train_matrix = vectorizer.fit_transform(train_docs)
    joblib.dump(vectorizer, os.path.join(files_path, 'tfidf_vectorizer.pk'))
    return train_matrix, train_labels