Example no. 1
# Flask view: classify the text submitted from the web form with every model
# previously trained and persisted for the current session.
def submit():
    # An empty string is shorter than ten characters, so a single length check suffices.
    if len(request.form['text_input']) < 10:
        return "Please provide a longer input so the classifier can understand it :)"

    # todo: change column name to be dynamically taken from training file
    test_data = pd.DataFrame([request.form['text_input']],
                             columns=['Document'])
    # Locate the artifacts saved for this session: every '*.pkl' file is a trained
    # classifier, while the TF-IDF vectorizer is stored separately as 'tfidf_vectorizer.pk'.
    session_id = request.cookies['session']
    path = os.path.join(app.config['UPLOAD_FOLDER'], session_id)
    trained_classifiers = [i for i in os.listdir(path) if '.pkl' in i]
    vectorizer = os.path.join(path, 'tfidf_vectorizer.pk')
    tfidf_transformer = joblib.load(vectorizer)
    # Apply the same cleaning pipeline that was used on the training data.
    pre_processor = PreProcess(test_data, column_name='Document')
    test_data = pre_processor.clean_html()
    test_data = pre_processor.remove_non_ascii()
    test_data = pre_processor.remove_spaces()
    test_data = pre_processor.remove_punctuation()
    test_data = pre_processor.stemming()
    test_data = pre_processor.lemmatization()
    test_data = pre_processor.stop_words()
    # Vectorize the cleaned text and run it through every stored classifier.
    test_vectors = tfidf_transformer.transform(test_data.Document)
    result = {}
    for clf in trained_classifiers:
        model = joblib.load(os.path.join(path, clf))
        classifier_name = os.path.splitext(clf)[0]
        result[classifier_name] = model.predict(test_vectors)[0]
    return render_template('results.html', result=result)


def main():
    # Load the BBC dataset into a DataFrame of documents and their categories.
    data = get_data(
        '/Users/aditya1/Documents/Document_Classification/bbc-dataset')

    ###############################################################################
    # Data Pre-processing steps
    ###############################################################################
    # The text lives in the first column of the DataFrame.
    column_name = data.columns[0]
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    pre_processor_operations = ['clean_html']  # currently unused; see the todo above
    data = pre_processor.clean_html()
    data = pre_processor.remove_non_ascii()
    data = pre_processor.remove_spaces()
    data = pre_processor.remove_punctuation()
    data = pre_processor.stemming()
    data = pre_processor.lemmatization()
    data = pre_processor.stop_words()

    ###############################################################################
    # Feature extraction
    ###############################################################################

    train_x, test_x, train_y, test_y = train_test_split(data.Document,
                                                        data.Category,
                                                        test_size=0.20)
    # Fit TF-IDF on the training split only, then persist the fitted vectorizer
    # so the same vocabulary can be reused at prediction time.
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    joblib.dump(tfidf_transformer, 'vectorizer.pkl')
    test_vectors = tfidf_transformer.transform(test_x)
    print(data.head())

    ###############################################################################
    # Perform classification with SVM, kernel=linear
    ###############################################################################
    model1 = svm.SVC(kernel='linear')
    model1.fit(train_vectors, train_y)
    joblib.dump(model1, 'SVM.pkl')
    y_pred_class = model1.predict(test_vectors)
    print("Prediction score for classifier %s:\n%s\n" %
          (model1, metrics.accuracy_score(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n" %
          (model1, metrics.classification_report(test_y, y_pred_class)))

    ###############################################################################
    # Perform classification with Multinomial Naive Bayes
    ###############################################################################
    model2 = MultinomialNB()
    model2.fit(train_vectors, train_y)
    joblib.dump(model2, 'MultinomialNB.pkl')
    y_pred_class = model2.predict(test_vectors)
    print("Accuracy score:", metrics.accuracy_score(test_y, y_pred_class))
    print("Confusion Matrix for classifier %s:\n%s\n" %
          (model2, metrics.confusion_matrix(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n" %
          (model2, metrics.classification_report(test_y, y_pred_class)))
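
These examples all rely on a project-specific PreProcess helper whose implementation is not shown. Judging only from the calls above, it wraps a DataFrame, cleans one text column step by step, and returns the updated DataFrame from every method. Below is a minimal sketch of such a class, assuming BeautifulSoup and NLTK as the underlying tools; the project's real implementation may differ.

import re
import string

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


class PreProcess:
    """Inferred interface: each method cleans self.data[column] and returns self.data."""

    def __init__(self, data, column_name):
        self.data = data
        self.column = column_name

    def _apply(self, func):
        self.data[self.column] = self.data[self.column].apply(func)
        return self.data

    def clean_html(self):
        return self._apply(lambda t: BeautifulSoup(t, 'html.parser').get_text())

    def remove_non_ascii(self):
        return self._apply(lambda t: re.sub(r'[^\x00-\x7f]', ' ', t))

    def remove_spaces(self):
        return self._apply(lambda t: re.sub(r'\s+', ' ', t).strip())

    def remove_punctuation(self):
        return self._apply(lambda t: t.translate(str.maketrans('', '', string.punctuation)))

    def stemming(self):
        stemmer = PorterStemmer()
        return self._apply(lambda t: ' '.join(stemmer.stem(w) for w in t.split()))

    def lemmatization(self):
        lemmatizer = WordNetLemmatizer()  # needs the NLTK 'wordnet' corpus
        return self._apply(lambda t: ' '.join(lemmatizer.lemmatize(w) for w in t.split()))

    def stop_words(self):
        stops = set(stopwords.words('english'))  # needs the NLTK 'stopwords' corpus
        return self._apply(lambda t: ' '.join(w for w in t.split() if w.lower() not in stops))
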
Example no. 3
def read_process_data(path, files_path):
    # Read the labelled CSV, clean the text column, fit a TF-IDF vectorizer,
    # persist it under files_path and return the training matrix and labels.
    data = pd.read_csv(path)
    column_name = data.columns[0]
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    data = pre_processor.clean_html()
    data = pre_processor.remove_non_ascii()
    data = pre_processor.remove_spaces()
    data = pre_processor.remove_punctuation()
    data = pre_processor.stemming()
    data = pre_processor.lemmatization()
    data = pre_processor.stop_words()
    # The held-out split is created here, but only the training portion is used below.
    train_x, test_x, train_y, test_y = train_test_split(data.Document,
                                                        data.Category,
                                                        test_size=0.20)
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    # Persist the fitted vectorizer so prediction code can reuse the same vocabulary.
    vectorizer_path = os.path.join(files_path, 'tfidf_vectorizer.pk')
    joblib.dump(tfidf_transformer, vectorizer_path)
    return train_vectors, train_y
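
read_process_data() returns the TF-IDF matrix and the training labels but stops short of fitting a model. As a usage note, here is a minimal sketch, assuming the same scikit-learn estimator used in the earlier example, of how a caller might train and persist a classifier from its output; the function name and the 'SVM.pkl' file name are illustrative, not taken from the original project.

import os

import joblib
from sklearn import svm


def train_from_csv(csv_path, files_path):
    # Clean the data and fit the vectorizer (persisted inside read_process_data),
    # then fit a linear SVM on the returned vectors and store it next to the vectorizer.
    train_vectors, train_y = read_process_data(csv_path, files_path)
    model = svm.SVC(kernel='linear')
    model.fit(train_vectors, train_y)
    joblib.dump(model, os.path.join(files_path, 'SVM.pkl'))
    return model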