def setUp(self):
    """Build the fixture: a bounded CSV sample, per-tag targets, and a predictor.

    NOTE(review): statement order matters — ``self.y_train`` must be pulled
    from the raw frame before ``tf_idf`` replaces ``self.train`` with features.
    """
    # Cap the rows read so the test stays fast.
    row_limit = TestLinearPredictor.number_of_rows
    self.train = pd.read_csv(train_file, nrows=row_limit)
    self.test = pd.read_csv(test_file, nrows=row_limit)
    # One target vector per tag, taken from the still-raw training frame.
    self.y_train = {tag: self.train[tag].values for tag in utils.TAGS}
    self.logistic_predictor = LogisticPredictor(**TestLinearPredictor.lr_params)
    # Replace the raw frames with their TF-IDF feature representations.
    self.train, self.test = tf_idf(self.train, self.test)
def post(self):
    """Score a JSON payload ``{"text": ...}`` with the global model.

    Returns a JSON body ``{"result": <int prediction>}`` on success, or
    ``{"result": "Error"}`` when the model cannot score the input.
    """
    text_dict = request.get_json(force=True)
    # Single-row frame so the column-wise preprocessing pipeline applies as-is.
    text_df = pd.DataFrame([text_dict['text']], columns=['text'], index=[0])
    pre.preprocess_text_cols(text_df)
    pre.create_new_features(text_df)
    # `corpus` is a module-level global; reading it needs no `global` statement.
    word_features_train_df = pre.tf_idf(text_df, 'final_text', corpus)
    train, features = pre.return_final_df(
        text_df, word_features_train_df, target_series=None)
    try:
        prediction = int(model.predict(train[features])[0])
    except ValueError:
        # Scoring failed; report a generic error payload.
        # (Original also assigned `prediction = None` here — dead code, removed.)
        return jsonify({"result": "Error"})
    return jsonify({"result": prediction})
def post(self):
    """Score form-posted text and render the prediction page.

    Renders ``prediction_form.html`` with the integer prediction (0 or 1),
    or the string ``'Error'`` when scoring fails.
    """
    text_dict = request.form.to_dict()
    # Single-row frame so the column-wise preprocessing pipeline applies as-is.
    text_df = pd.DataFrame([text_dict['text']], columns=['text'], index=[0])
    pre.preprocess_text_cols(text_df)
    pre.create_new_features(text_df)
    # `corpus` is a module-level global; reading it needs no `global` statement.
    word_features_train_df = pre.tf_idf(text_df, 'final_text', corpus)
    train, features = pre.return_final_df(
        text_df, word_features_train_df, target_series=None)
    try:
        prediction = int(model.predict(train[features])[0])
    except ValueError:
        prediction = None
    headers = {'Content-Type': 'text/html'}
    # The original rendered 0 and 1 identically and anything else as 'Error';
    # collapse the duplicated branches into one.
    shown = prediction if prediction in (0, 1) else 'Error'
    return make_response(
        render_template('prediction_form.html',
                        prediction=shown,
                        text=text_dict['text']),
        200, headers)
def _print_split_results(classifier_function_name, label, vectorized_data):
    """Print averaged accuracy at 10% and 20% test splits for one vectorization.

    ``label`` is "Count" or "TF-IDF"; ``classifier_function_name`` is the name
    of the classifier routine, combined into a call string that
    ``average_classifier`` evaluates (pre-existing string-eval convention).
    """
    for pct, frac in (("10", "0.1"), ("20", "0.2")):
        print(
            " %s vectorization, %s%% test data:" % (label, pct),
            average_classifier(
                classifier_function_name + "(test_data, %s)" % frac,
                vectorized_data))


def _report_vocab_subset(classifier_function_name, merged_labeled_data,
                         vocabulary, heading, word_count):
    """Evaluate count and TF-IDF vectorizations restricted to ``word_count`` words."""
    print(" ^%s^" % heading)
    counts = preprocessing.count_vectorizer_word_amount(
        merged_labeled_data, vocabulary, word_count)
    _print_split_results(classifier_function_name, "Count", counts)
    _print_split_results(classifier_function_name, "TF-IDF",
                         preprocessing.tf_idf(counts))


def test_classifier(classifier, classifier_function_name, vocabulary,
                    merged_labeled_data):
    """Benchmark one classifier over count/TF-IDF vectorizations.

    Prints dataset statistics, then accuracy for 10%/20% test splits using the
    full vocabulary and its 1000- and 200-most-popular-word subsets.
    """
    print("\n ***", classifier, "***")
    print("\nORIGINAL DATA", )
    print(" Number of articles:", len(merged_labeled_data))
    print(" Number of total words:",
          sum(len(item[1]) for item in merged_labeled_data))
    print(" Number of distinct words:", len(vocabulary))
    # Original computed the full size as sum(1 for _ in vocabulary.items());
    # that is exactly len(vocabulary).
    for heading, word_count in (("ALL WORDS", len(vocabulary)),
                                ("FIRST 1000 WORDS", 1000),
                                ("FIRST 200 WORDS", 200)):
        _report_vocab_subset(classifier_function_name, merged_labeled_data,
                             vocabulary, heading, word_count)