def submit():
    """Classify the submitted text with every trained model for this session.

    Reads the form field ``text_input``, rejects inputs shorter than 10
    characters, re-applies the same preprocessing chain used at training
    time, vectorizes the text with the session's persisted TF-IDF
    vectorizer, and runs every pickled classifier found in the session's
    upload folder.

    Returns:
        Rendered ``results.html`` with ``{classifier_name: predicted_label}``,
        or a plain error string for too-short input.
    """
    text_input = request.form['text_input']
    # len() < 10 already rejects the empty string, so the original's separate
    # == "" check was redundant.
    if len(text_input) < 10:
        return "Please provide input large enough, Classifier can understand :)"

    # todo: change column name to be dynamically taken from training file
    test_data = pd.DataFrame([text_input], columns=['Document'])

    # Per-session artifacts live under UPLOAD_FOLDER/<session-id>/.
    session_id = request.cookies['session']
    path = os.path.join(app.config['UPLOAD_FOLDER'], session_id)
    # endswith() instead of substring test so e.g. 'x.pkl.bak' is not picked up.
    trained_classifier = [i for i in os.listdir(path) if i.endswith('.pkl')]

    # TF-IDF vectorizer fitted at training time for this session.
    vectorizer = os.path.join(path, 'tfidf_vectorizer.pk')
    tfidf_transformer = joblib.load(vectorizer)

    # Apply the identical preprocessing chain used at training time; each
    # step returns the updated DataFrame.
    pre_processor = PreProcess(test_data, column_name='Document')
    test_data = pre_processor.clean_html()
    test_data = pre_processor.remove_non_ascii()
    test_data = pre_processor.remove_spaces()
    test_data = pre_processor.remove_punctuation()
    test_data = pre_processor.stemming()
    test_data = pre_processor.lemmatization()
    test_data = pre_processor.stop_words()

    test_data1 = tfidf_transformer.transform(test_data.Document)

    result = {}
    for clf in trained_classifier:
        model = joblib.load(os.path.join(path, clf))
        # Predict once and reuse the value (the original invoked predict()
        # twice per classifier for the same input).
        prediction = model.predict(test_data1)[0]
        print(clf, prediction)
        classifier_name = clf.split('/')[-1].split('.')[0]
        result[classifier_name] = prediction
    print(result)
    return render_template('results.html', result=result)
def main():
    """Train, persist, and evaluate SVM and Multinomial Naive Bayes
    classifiers on the BBC document dataset.

    Side effects: writes ``vectorizer.pkl``, ``SVM.pkl`` and
    ``MultinomialNB.pkl`` to the current working directory and prints
    evaluation reports to stdout.
    """
    # todo: take the dataset location from configuration/CLI instead of a
    # hard-coded absolute path.
    data = get_data(
        '/Users/aditya1/Documents/Document_Classification/bbc-dataset')

    ###########################################################################
    # Data Pre-processing steps
    ###########################################################################
    column_name = data.columns[0]
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    # (Removed the unused local `pre_processor_operations` from the original.)
    data = pre_processor.clean_html()
    data = pre_processor.remove_non_ascii()
    data = pre_processor.remove_spaces()
    data = pre_processor.remove_punctuation()
    data = pre_processor.stemming()
    data = pre_processor.lemmatization()
    data = pre_processor.stop_words()

    ###########################################################################
    # Feature extraction
    ###########################################################################
    train_x, test_x, train_y, test_y = train_test_split(data.Document,
                                                        data.Category,
                                                        test_size=0.20)
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    # NOTE(review): the web app loads 'tfidf_vectorizer.pk' from a session
    # folder (see read_process_data), while this standalone script writes
    # 'vectorizer.pkl' to the CWD — confirm the two pipelines are
    # intentionally separate before unifying the names.
    joblib.dump(tfidf_transformer, 'vectorizer.pkl')
    test_vectors = tfidf_transformer.transform(test_x)
    print(data.head())

    ###########################################################################
    # Perform classification with SVM, kernel=linear
    model1 = svm.SVC(kernel='linear')
    model1.fit(train_vectors, train_y)
    joblib.dump(model1, 'SVM.pkl')
    y_pred_class = model1.predict(test_vectors)
    # The original printed the bare accuracy and then the same value again in
    # the formatted report below; the duplicate bare print was dropped.
    print("Prediction score for classifier %s:\n%s\n"
          % (model1, metrics.accuracy_score(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n"
          % (model1, metrics.classification_report(test_y, y_pred_class)))

    # Multinomial Naive Bayes on the same train/test split.
    model2 = MultinomialNB()
    model2.fit(train_vectors, train_y)
    joblib.dump(model2, 'MultinomialNB.pkl')
    y_pred_class = model2.predict(test_vectors)
    print("Accuracy score:", metrics.accuracy_score(test_y, y_pred_class))
    print("Confusion Matrix for classifier %s:\n%s\n"
          % (model2, metrics.confusion_matrix(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n"
          % (model2, metrics.classification_report(test_y, y_pred_class)))
def read_process_data(path, files_path):
    """Read a training CSV, clean its text, and fit a TF-IDF vectorizer.

    The fitted vectorizer is persisted to ``files_path/tfidf_vectorizer.pk``
    so the prediction endpoint can reload it later.

    Args:
        path: CSV file with the training documents; the first column is the
            text column handed to PreProcess.
        files_path: Directory in which to store the pickled vectorizer.

    Returns:
        Tuple of (TF-IDF matrix for the training split, matching labels).
    """
    frame = pd.read_csv(path)
    cleaner = PreProcess(frame, frame.columns[0])
    # todo: change code to provide all functions in class definition.
    # Run every cleaning step in order; each one returns the updated frame.
    for step in ('clean_html', 'remove_non_ascii', 'remove_spaces',
                 'remove_punctuation', 'stemming', 'lemmatization',
                 'stop_words'):
        frame = getattr(cleaner, step)()

    # Only the 80% training portion is used here; the held-out 20% is
    # discarded deliberately so the vectorizer never sees it.
    train_docs, _, train_labels, _ = train_test_split(frame.Document,
                                                      frame.Category,
                                                      test_size=0.20)

    vectorizer = TfidfVectorizer(min_df=1)
    train_matrix = vectorizer.fit_transform(train_docs)
    joblib.dump(vectorizer, os.path.join(files_path, 'tfidf_vectorizer.pk'))
    return train_matrix, train_labels