Code example #1
def WORDNET_pre_proc(suspicious_corpus):
    # Lower-case, strip punctuation, and clean each suspicious document.
    pre_processed_files = []
    for text in suspicious_corpus:
        suspicious = Pre_Processing.lower_case(text)
        # The helpers return new strings, so their results must be captured;
        # the original dropped these return values, making the calls no-ops.
        suspicious = Pre_Processing.remove_punctuation(suspicious)
        suspicious = Pre_Processing.clean_text(suspicious)
        pre_processed_files.append(suspicious)
    print("WordNet Pre-Processing Complete")
    return pre_processed_files
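The Pre_Processing module these excerpts call into is not shown anywhere in the source. Below is a minimal sketch of what its helpers might look like, assuming NLTK for tokenization, stopword removal, and lemmatization; every function body here is an assumption reconstructed from how the helpers are used above, not the dissertation's actual module.

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# One-time setup (assumed): nltk.download('punkt'), nltk.download('stopwords'),
# nltk.download('wordnet')

def lower_case(text):
    return text.lower()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def clean_text(text):
    # Collapse runs of whitespace into single spaces.
    return re.sub(r'\s+', ' ', text).strip()

def tokenization(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    stops = set(stopwords.words('english'))
    return [t for t in tokens if t not in stops]

def lemmatize_words(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(t) for t in tokens]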
Code example #2
def TFIDF_pre_proc(suspicious_corpus):
    # Full normalization pipeline: case-fold, strip punctuation, clean,
    # tokenize, drop stopwords, and lemmatize each suspicious document.
    pre_processed_files = []
    for text in suspicious_corpus:
        suspicious = Pre_Processing.lower_case(text)
        suspicious = Pre_Processing.remove_punctuation(suspicious)
        suspicious = Pre_Processing.clean_text(suspicious)
        suspicious = Pre_Processing.tokenization(suspicious)
        suspicious = Pre_Processing.remove_stopwords(suspicious)
        suspicious = Pre_Processing.lemmatize_words(suspicious)
        pre_processed_files.append(suspicious)
    print("TFIDF Pre-Processing Complete")
    return pre_processed_files
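Since TFIDF_pre_proc returns each document as a list of tokens, one plausible way to consume its output is scikit-learn's TfidfVectorizer with a callable analyzer, followed by pairwise cosine similarity. This pairing is an assumption for illustration, not the original code.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Stand-in for the token lists TFIDF_pre_proc would return.
docs = [['cat', 'sat', 'mat'], ['dog', 'sat', 'mat'], ['pagerank', 'algorithm']]
# A callable analyzer makes the vectorizer accept pre-tokenized input as-is.
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
matrix = vectorizer.fit_transform(docs)
print(cosine_similarity(matrix))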
Code example #3
def LCS_pre_proc(original_corpora, suspicious_corpus):
    # Note: the original corpus is handled as a single text here, while
    # the suspicious corpus is processed document by document.
    pre_processed_files = []
    sus = []
    original = Pre_Processing.lower_case(original_corpora)
    original = Pre_Processing.remove_punctuation(original)
    original = Pre_Processing.clean_text(original)
    pre_processed_files.append(original)

    for text in suspicious_corpus:
        suspicious = Pre_Processing.lower_case(text)
        suspicious = Pre_Processing.remove_punctuation(suspicious)
        suspicious = Pre_Processing.clean_text(suspicious)
        sus.append(suspicious)
    pre_processed_files.append(sus)
    print("LCS Pre-Processing Complete")
    return pre_processed_files
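LCS_pre_proc prepares texts for a longest-common-subsequence comparison. A standard dynamic-programming computation of LCS length looks like the sketch below; this particular implementation is illustrative, not the dissertation's own.

def lcs_length(a, b):
    # dp[i][j] holds the LCS length of a[:i] and b[:j].
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a, 1):
        for j, y in enumerate(b, 1):
            dp[i][j] = dp[i-1][j-1] + 1 if x == y else max(dp[i-1][j], dp[i][j-1])
    return dp[len(a)][len(b)]

print(lcs_length("plagiarism detection", "plagiarised section"))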
Code example #4
filenames = os.listdir(
    "C:/Users/Chris/Documents/UoB_MSc_Computer_Science/MSc_Dissertation/cjh748/scikit-machine-learning/train_test_corpus"
)
files = []
array_data = []
array_label = []
# Read every file in the corpus, normalize it, and keep the cleaned text.
for file in filenames:
    with codecs.open(
            "C:/Users/Chris/Documents/UoB_MSc_Computer_Science/"
            "MSc_Dissertation/cjh748/scikit-machine-learning/train_test_corpus/"
            + file,
            "r",
            encoding='utf-8',
            errors='ignore') as file_data:
        open_file = file_data.read()
        open_file = Pre_Processing.lower_case(open_file)
        open_file = Pre_Processing.remove_punctuation(open_file)
        open_file = Pre_Processing.clean_text(open_file)
        files.append(open_file)

for file in files:
    if 'inheritance' in file:
        array_data.append(file)
        array_label.append('Inheritance (object-oriented programming)')
    elif 'pagerank' in file:
        array_data.append(file)
        array_label.append('PageRank')
    elif 'vector space model' in file:
        array_data.append(file)
        array_label.append('Vector Space Model')
    elif 'bayes' in file:
        array_data.append(file)
        # Label assumed for illustration; the source excerpt truncates here.
        array_label.append('Bayes')
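The array_data/array_label pairs are shaped for a scikit-learn text classifier. A hypothetical continuation (the excerpt does not show which model the author trained) might split the data and fit a simple TF-IDF + Naive Bayes pipeline:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

X_train, X_test, y_train, y_test = train_test_split(
    array_data, array_label, test_size=0.2, random_state=42)
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X_train, y_train)
print(model.score(X_test, y_test))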
Code example #5
def NGRAM_pre_proc(original_corpus, suspicious_corpus):
    # Apply the same tokenize/stopword/lemmatize pipeline to both corpora;
    # returns [processed_originals, processed_suspicious].
    pre_processed_files = []
    sus = []
    orig = []
    for text in original_corpus:
        original = Pre_Processing.lower_case(text)
        original = Pre_Processing.remove_punctuation(original)
        original = Pre_Processing.clean_text(original)
        original = Pre_Processing.tokenization(original)
        original = Pre_Processing.remove_stopwords(original)
        original = Pre_Processing.lemmatize_words(original)
        orig.append(original)
    pre_processed_files.append(orig)

    for text in suspicious_corpus:
        suspicious = Pre_Processing.lower_case(text)
        suspicious = Pre_Processing.remove_punctuation(suspicious)
        suspicious = Pre_Processing.clean_text(suspicious)
        suspicious = Pre_Processing.tokenization(suspicious)
        suspicious = Pre_Processing.remove_stopwords(suspicious)
        suspicious = Pre_Processing.lemmatize_words(suspicious)
        sus.append(suspicious)
    pre_processed_files.append(sus)
    print("NGram Overlap Pre-Processing Complete")
    return pre_processed_files
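Given the token lists NGRAM_pre_proc returns, a common overlap measure is Jaccard similarity over n-gram sets. The sketch below assumes trigrams by default; the dissertation's exact metric and choice of n are not shown in the excerpt.

def ngrams(tokens, n):
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def ngram_jaccard(tokens_a, tokens_b, n=3):
    a, b = ngrams(tokens_a, n), ngrams(tokens_b, n)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)

print(ngram_jaccard("the cat sat on the mat".split(),
                    "the cat sat on a mat".split()))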