Code example #1
def WORDNET_pre_proc(suspicious_corpus):
    # Lower-case, strip punctuation, and clean each suspicious document.
    pre_processed_files = []
    for text in suspicious_corpus:
        suspicious = Pre_Processing.lower_case(text)
        # The helpers return new strings, so their results must be captured;
        # the original dropped these return values, making the calls no-ops.
        suspicious = Pre_Processing.remove_punctuation(suspicious)
        suspicious = Pre_Processing.clean_text(suspicious)
        pre_processed_files.append(suspicious)
    print("WordNet Pre-Processing Complete")
    return pre_processed_files
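The Pre_Processing module these excerpts call into is not shown anywhere in the source. Below is a minimal sketch of what its helpers might look like, assuming NLTK for tokenization, stopword removal, and lemmatization; every function body here is an assumption reconstructed from how the helpers are used above, not the dissertation's actual module.

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# One-time setup (assumed): nltk.download('punkt'), nltk.download('stopwords'),
# nltk.download('wordnet')

def lower_case(text):
    return text.lower()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def clean_text(text):
    # Collapse runs of whitespace into single spaces.
    return re.sub(r'\s+', ' ', text).strip()

def tokenization(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    stops = set(stopwords.words('english'))
    return [t for t in tokens if t not in stops]

def lemmatize_words(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(t) for t in tokens]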
Code example #2
def TFIDF_pre_proc(suspicious_corpus):
    # Full normalization pipeline: case-fold, strip punctuation, clean,
    # tokenize, drop stopwords, and lemmatize each suspicious document.
    pre_processed_files = []
    for text in suspicious_corpus:
        suspicious = Pre_Processing.lower_case(text)
        suspicious = Pre_Processing.remove_punctuation(suspicious)
        suspicious = Pre_Processing.clean_text(suspicious)
        suspicious = Pre_Processing.tokenization(suspicious)
        suspicious = Pre_Processing.remove_stopwords(suspicious)
        suspicious = Pre_Processing.lemmatize_words(suspicious)
        pre_processed_files.append(suspicious)
    print("TFIDF Pre-Processing Complete")
    return pre_processed_files
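Since TFIDF_pre_proc returns each document as a list of tokens, one plausible way to consume its output is scikit-learn's TfidfVectorizer with a callable analyzer, followed by pairwise cosine similarity. This pairing is an assumption for illustration, not the original code.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Stand-in for the token lists TFIDF_pre_proc would return.
docs = [['cat', 'sat', 'mat'], ['dog', 'sat', 'mat'], ['pagerank', 'algorithm']]
# A callable analyzer makes the vectorizer accept pre-tokenized input as-is.
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
matrix = vectorizer.fit_transform(docs)
print(cosine_similarity(matrix))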
Code example #3
def LCS_pre_proc(original_corpora, suspicious_corpus):
    # Note: the original corpus is handled as a single text here, while
    # the suspicious corpus is processed document by document.
    pre_processed_files = []
    sus = []
    original = Pre_Processing.lower_case(original_corpora)
    original = Pre_Processing.remove_punctuation(original)
    original = Pre_Processing.clean_text(original)
    pre_processed_files.append(original)

    for text in suspicious_corpus:
        suspicious = Pre_Processing.lower_case(text)
        suspicious = Pre_Processing.remove_punctuation(suspicious)
        suspicious = Pre_Processing.clean_text(suspicious)
        sus.append(suspicious)
    pre_processed_files.append(sus)
    print("LCS Pre-Processing Complete")
    return pre_processed_files
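LCS_pre_proc prepares texts for a longest-common-subsequence comparison. A standard dynamic-programming computation of LCS length looks like the sketch below; this particular implementation is illustrative, not the dissertation's own.

def lcs_length(a, b):
    # dp[i][j] holds the LCS length of a[:i] and b[:j].
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a, 1):
        for j, y in enumerate(b, 1):
            dp[i][j] = dp[i-1][j-1] + 1 if x == y else max(dp[i-1][j], dp[i][j-1])
    return dp[len(a)][len(b)]

print(lcs_length("plagiarism detection", "plagiarised section"))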
Code example #4
filenames = os.listdir(
    "C:/Users/Chris/Documents/UoB_MSc_Computer_Science/MSc_Dissertation/cjh748/scikit-machine-learning/train_test_corpus"
)
files = []
array_data = []
array_label = []
# Read every file in the corpus, normalize it, and keep the cleaned text.
for file in filenames:
    with codecs.open(
            "C:/Users/Chris/Documents/UoB_MSc_Computer_Science/"
            "MSc_Dissertation/cjh748/scikit-machine-learning/train_test_corpus/"
            + file,
            "r",
            encoding='utf-8',
            errors='ignore') as file_data:
        open_file = file_data.read()
        open_file = Pre_Processing.lower_case(open_file)
        open_file = Pre_Processing.remove_punctuation(open_file)
        open_file = Pre_Processing.clean_text(open_file)
        files.append(open_file)

for file in files:
    if 'inheritance' in file:
        array_data.append(file)
        array_label.append('Inheritance (object-oriented programming)')
    elif 'pagerank' in file:
        array_data.append(file)
        array_label.append('PageRank')
    elif 'vector space model' in file:
        array_data.append(file)
        array_label.append('Vector Space Model')
    elif 'bayes' in file:
        array_data.append(file)
        # Label assumed for illustration; the source excerpt truncates here.
        array_label.append('Bayes')
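The array_data/array_label pairs are shaped for a scikit-learn text classifier. A hypothetical continuation (the excerpt does not show which model the author trained) might split the data and fit a simple TF-IDF + Naive Bayes pipeline:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

X_train, X_test, y_train, y_test = train_test_split(
    array_data, array_label, test_size=0.2, random_state=42)
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X_train, y_train)
print(model.score(X_test, y_test))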
Code example #5
def NGRAM_pre_proc(original_corpus, suspicious_corpus):
    # Apply the same tokenize/stopword/lemmatize pipeline to both corpora;
    # returns [processed_originals, processed_suspicious].
    pre_processed_files = []
    sus = []
    orig = []
    for text in original_corpus:
        original = Pre_Processing.lower_case(text)
        original = Pre_Processing.remove_punctuation(original)
        original = Pre_Processing.clean_text(original)
        original = Pre_Processing.tokenization(original)
        original = Pre_Processing.remove_stopwords(original)
        original = Pre_Processing.lemmatize_words(original)
        orig.append(original)
    pre_processed_files.append(orig)

    for text in suspicious_corpus:
        suspicious = Pre_Processing.lower_case(text)
        suspicious = Pre_Processing.remove_punctuation(suspicious)
        suspicious = Pre_Processing.clean_text(suspicious)
        suspicious = Pre_Processing.tokenization(suspicious)
        suspicious = Pre_Processing.remove_stopwords(suspicious)
        suspicious = Pre_Processing.lemmatize_words(suspicious)
        sus.append(suspicious)
    pre_processed_files.append(sus)
    print("NGram Overlap Pre-Processing Complete")
    return pre_processed_files
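Given the token lists NGRAM_pre_proc returns, a common overlap measure is Jaccard similarity over n-gram sets. The sketch below assumes trigrams by default; the dissertation's exact metric and choice of n are not shown in the excerpt.

def ngrams(tokens, n):
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def ngram_jaccard(tokens_a, tokens_b, n=3):
    a, b = ngrams(tokens_a, n), ngrams(tokens_b, n)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)

print(ngram_jaccard("the cat sat on the mat".split(),
                    "the cat sat on a mat".split()))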