def setUp(self):
    """Build the fixtures shared by the tests of this case.

    Reads the first ``TestLinearPredictor.number_of_rows`` rows of the
    train and test CSV files, extracts one label array per tag in
    ``utils.TAGS``, instantiates the logistic predictor from
    ``TestLinearPredictor.lr_params`` and finally stores the output of
    ``tf_idf`` as ``self.train`` / ``self.test``.
    """
    row_limit = TestLinearPredictor.number_of_rows
    raw_train = pd.read_csv(train_file, nrows=row_limit)
    raw_test = pd.read_csv(test_file, nrows=row_limit)

    # Labels are taken from the raw train frame, i.e. before tf_idf()
    # replaces it below.
    self.y_train = {tag: raw_train[tag].values for tag in utils.TAGS}

    predictor_kwargs = TestLinearPredictor.lr_params
    self.logistic_predictor = LogisticPredictor(**predictor_kwargs)

    self.train, self.test = tf_idf(raw_train, raw_test)
Example #2
0
	def post(self):
		"""Classify the text sent in a JSON request body.

		The body is parsed with ``request.get_json(force=True)`` and must
		contain a ``'text'`` key. The text is wrapped in a one-row
		DataFrame, run through the ``pre`` preprocessing helpers and handed
		to ``model.predict``.

		Returns:
			``jsonify({"result": <int>})`` with the first prediction cast to
			``int``, or ``jsonify({"result": "Error"})`` when predicting (or
			the ``int`` conversion) raises ``ValueError``.
		"""
		text_dict = request.get_json(force=True)
		text_df = pd.DataFrame([text_dict['text']], columns=['text'], index=[0])
		# Return values are ignored here; presumably both helpers modify
		# text_df in place (e.g. add the 'final_text' column) -- TODO confirm.
		pre.preprocess_text_cols(text_df)
		pre.create_new_features(text_df)

		global corpus
		word_features_df = pre.tf_idf(text_df, 'final_text', corpus)
		frame, features = pre.return_final_df(text_df, word_features_df, target_series=None)
		try:
			prediction = int(model.predict(frame[features])[0])
		except ValueError:
			# Report the failure to the client instead of propagating it.
			return jsonify({"result": "Error"})

		return jsonify({"result": prediction})
Example #3
0
	def post(self):
		"""Classify the text submitted through the HTML form and re-render it.

		Reads the ``'text'`` field from ``request.form``, runs it through the
		``pre`` preprocessing helpers and ``model.predict``, then renders
		``prediction_form.html`` with the outcome.

		Returns:
			A ``text/html`` response with status 200. The template receives
			``prediction`` as ``0`` or ``1`` when the model produced one of
			those values, and ``'Error'`` otherwise (including when
			predicting raised ``ValueError``), plus the submitted ``text``.
		"""
		text_dict = request.form.to_dict()
		text = text_dict['text']
		text_df = pd.DataFrame([text], columns=['text'], index=[0])
		# Return values are ignored here; presumably both helpers modify
		# text_df in place (e.g. add the 'final_text' column) -- TODO confirm.
		pre.preprocess_text_cols(text_df)
		pre.create_new_features(text_df)

		global corpus
		word_features_df = pre.tf_idf(text_df, 'final_text', corpus)
		frame, features = pre.return_final_df(text_df, word_features_df, target_series=None)
		# NOTE(review): looks like leftover debug output -- consider logging.
		print(frame[features])
		try:
			prediction = int(model.predict(frame[features])[0])
		except ValueError:
			prediction = None

		# Only the labels 0 and 1 are shown as-is; anything else is an error.
		shown = prediction if prediction in (0, 1) else 'Error'
		headers = {'Content-Type': 'text/html'}
		page = render_template('prediction_form.html', prediction=shown, text=text)
		return make_response(page, 200, headers)
Example #4
0
def test_classifier(classifier, classifier_function_name, vocabulary,
                    merged_labeled_data):
    """Print an evaluation report for one classifier.

    The report starts with basic statistics about the data and then, for
    three vocabulary sizes (all words, 1000 and 200), prints the value
    returned by ``average_classifier`` for count-vectorized and TF-IDF
    data, each with a 10% and a 20% test share.

    Args:
        classifier: Label printed in the report banner.
        classifier_function_name: String that is concatenated with
            ``"(test_data, <fraction>)"`` and passed to
            ``average_classifier``.
            NOTE(review): that expression string is presumably evaluated
            by ``average_classifier`` -- confirm it never comes from
            untrusted input.
        vocabulary: Sized mapping of the known words; its length is
            reported as the number of distinct words.
        merged_labeled_data: Sized iterable of items whose element at
            index 1 is a sized collection (presumably the words of one
            article -- verify against caller).

    Returns:
        None. All results are written to standard output.
    """

    def report(scheme, vectorized_data):
        # One report line per test-set share for the given vectorization.
        for percent, fraction in (("10", "0.1"), ("20", "0.2")):
            print(
                "    {} vectorization, {}% test data:".format(scheme, percent),
                average_classifier(
                    classifier_function_name
                    + "(test_data, {})".format(fraction),
                    vectorized_data))

    print("\n   ***", classifier, "***")

    # Original data testing
    print("\nORIGINAL DATA")
    print("  Number of articles:", len(merged_labeled_data))
    print("  Number of total words:",
          sum(len(item[1]) for item in merged_labeled_data))
    print("  Number of distinct words:", len(vocabulary))

    # (section header, value passed as the third argument of
    # count_vectorizer_word_amount -- presumably the number of most
    # popular words to keep)
    sections = (
        ("   ^ALL WORDS^", len(vocabulary)),
        ("   ^FIRST 1000 WORDS^", 1000),
        ("   ^FIRST 200 WORDS^", 200),
    )
    for header, word_amount in sections:
        print(header)
        counts = preprocessing.count_vectorizer_word_amount(
            merged_labeled_data, vocabulary, word_amount)
        report("Count", counts)
        # TF-IDF is derived from the count vectors of the same section.
        report("TF-IDF", preprocessing.tf_idf(counts))