def run(df) -> DataFrame:
    """Run the preprocessing step on ``df`` and log how long it took.

    Logs a start message, passes ``df`` through ``preprocess``, logs the
    resulting frame with ``logDF`` and finally logs the elapsed seconds.

    Args:
        df: The dataframe handed to ``preprocess``.

    Returns:
        The value returned by ``preprocess(df)``.
    """
    log("Preprocessing dataframe...")

    # perf_counter() is a monotonic clock, so the measured duration cannot be
    # skewed (or become negative) if the system wall clock is adjusted while
    # the preprocessing is running.
    start_time = time.perf_counter()
    df = preprocess(df)
    logDF(df)
    # The timed region deliberately includes the logDF call above.
    elapsed = time.perf_counter() - start_time

    log("Pre-process Finished! (" + str(elapsed) + " seconds)")
    return df
def run(classifier, X, y):
    """Fit one classifier on TF-IDF features and report its test results.

    The data is split with ``train_test_split`` (``random_state=0``), a
    pipeline of ``TfidfVectorizer`` followed by ``get_classifier(classifier)``
    is fitted on the training part, and the predictions for the test part
    are passed to ``calc_and_print_metrics`` and ``confusion_matrix``.

    Args:
        classifier: Name of the classifier; concatenated into the log
            message and forwarded to ``get_classifier`` and
            ``confusion_matrix``.
        X: Input samples handed to ``train_test_split``.
        y: Labels handed to ``train_test_split``.
    """
    log("Running " + classifier + " Classifier...")

    split = train_test_split(X, y, random_state=0)
    train_inputs, test_inputs, train_labels, test_labels = split

    vectorizer = TfidfVectorizer()
    estimator = get_classifier(classifier)
    pipeline = make_pipeline(vectorizer, estimator)
    # Disabled variant with an additional SMOTE step in the pipeline:
    # pipeline = make_pipeline(TfidfVectorizer(), SMOTE(random_state=42), get_classifier(classifier))

    pipeline.fit(train_inputs, train_labels)
    predictions = pipeline.predict(test_inputs)

    calc_and_print_metrics(test_labels, predictions)
    confusion_matrix(test_labels, predictions, classifier)
def preprocess(df) -> DataFrame:
    """Clean the ``tweet`` column of ``df`` through a fixed chain of steps.

    The column is lower-cased first; afterwards each step logs its message
    and replaces the column with the result of applying the step's function
    to every tweet. The column of the passed-in frame is reassigned in place.

    Args:
        df: Dataframe with a ``tweet`` column.

    Returns:
        The same ``df`` object with the rewritten ``tweet`` column.
    """
    df.tweet = df.tweet.str.lower()

    # Ordered (log message, per-tweet function) pairs; the order matters
    # because every step works on the output of the previous one.
    steps = (
        ("\t--removing user tags", clean_user_tags),
        ("\t--removing links", remove_links),
        ("\t--decontracting words", decontracted),
        ("\t--replacing emojizzz with description", replace_emoji_with_description),
        ("\t--cleaning punctuation", clean_punc),
        ("\t--keeping just text", keep_alpha),
        ("\t--removing stopwords", remove_stop_words),
        ("\t--merging multiple spaces", merge_multiple_character_occurrences),
        ("\t--stemming tweets", stem_tweets),
    )
    for message, transform in steps:
        log(message)
        df.tweet = df.tweet.apply(transform)

    return df
import HateTweets.IO.InputManager as InputManager
import HateTweets.Classification as classifiers
import HateTweets.preprocess.PreprocessUtil as PreProcess
from HateTweets.IO.OutputUtil import log, logDF, save2csv, plot

# Entry script: load the dataset, preprocess it, plot the 'class' column
# and hand the frame to the classifiers, framed by two banner log lines.
log("### HATE TWEETS CLASSIFICATION ###")

# Load the data and feed it straight into the preprocessing step.
df = PreProcess.run(InputManager.load_data())
plot(df, 'class')
# Disabled: persist the preprocessed frame.
# save2csv(df, "preprocessed")
classifiers.classify(df)

log("##################################")
def load_data() -> pd.DataFrame:
    """Read the dataset CSV into a dataframe and log its first rows.

    The file path is ``DATASET_DIR + DATASET_FILENAME``; the first CSV
    column is used as the index (``index_col=0``).

    Returns:
        The dataframe produced by ``pd.read_csv``.
    """
    log("Reading " + DATASET_FILENAME)

    csv_path = DATASET_DIR + DATASET_FILENAME
    frame = pd.read_csv(csv_path, index_col=0)

    preview = frame.head()
    logDF(preview)
    # Disabled: logDF(frame.info())
    return frame