def TFIDF_pre_proc(original_corpus, suspicious_corpus): pre_processed_files = [] sus = [] orig = [] for text in original_corpus: original = Pre_Processing.lower_case(text) original = Pre_Processing.remove_punctuation(original) original = Pre_Processing.clean_text(original) original = Pre_Processing.tokenization(original) original = Pre_Processing.remove_stopwords(original) original = Pre_Processing.lemmatize_words(original) orig.append(original) pre_processed_files.append(orig) for text in suspicious_corpus: suspicious = Pre_Processing.lower_case(text) suspicious = Pre_Processing.remove_punctuation(suspicious) suspicious = Pre_Processing.clean_text(suspicious) suspicious = Pre_Processing.tokenization(suspicious) suspicious = Pre_Processing.remove_stopwords(suspicious) suspicious = Pre_Processing.lemmatize_words(suspicious) sus.append(suspicious) pre_processed_files.append(sus) print("TFIDF Pre-Processing Complete") return pre_processed_files
def NGRAM_pre_proc(suspicious_corpus): pre_processed_files = [] for text in suspicious_corpus: suspicious = Pre_Processing.lower_case(text) suspicious = Pre_Processing.remove_punctuation(suspicious) suspicious = Pre_Processing.clean_text(suspicious) suspicious = Pre_Processing.tokenization(suspicious) suspicious = Pre_Processing.remove_stopwords(suspicious) suspicious = Pre_Processing.lemmatize_words(suspicious) pre_processed_files.append(suspicious) print("NGram Overlap Pre-Processing Complete") return pre_processed_files