Example no. 1
0
def run(df) -> DataFrame:
    """Run the preprocessing pipeline on *df*, logging the elapsed time.

    Args:
        df: input DataFrame (expected to have a ``tweet`` column; see preprocess).

    Returns:
        The preprocessed DataFrame.
    """
    log("Preprocessing dataframe...")

    # perf_counter is monotonic and high-resolution — preferred over
    # time.time() for measuring elapsed intervals.
    start_time = time.perf_counter()
    df = preprocess(df)
    logDF(df)
    end_time = time.perf_counter()

    log(f"Pre-process Finished! ({end_time - start_time} seconds)")
    return df
Example no. 2
0
def run(classifier, X, y):
    """Train and evaluate the named classifier on a TF-IDF pipeline.

    Args:
        classifier: name of the classifier, resolved via get_classifier().
        X: raw text samples.
        y: target labels.

    Side effects: logs metrics and renders a confusion matrix; returns None.
    """
    log(f"Running {classifier} Classifier...")

    # Fixed random_state keeps the train/test split reproducible across runs.
    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

    model = make_pipeline(TfidfVectorizer(), get_classifier(classifier))

    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)

    calc_and_print_metrics(y_test, y_predicted)
    # NOTE(review): this shadows sklearn's confusion_matrix signature —
    # presumably a project-local plotting wrapper; confirm.
    confusion_matrix(y_test, y_predicted, classifier)
Example no. 3
0
def preprocess(df) -> DataFrame:
    """Clean the ``tweet`` column in place through a fixed sequence of steps.

    Each step logs its name and maps a cleaning function over df.tweet.
    Returns the same DataFrame with the cleaned column.
    """
    # Lowercasing first so every later transform sees normalized case.
    df.tweet = df.tweet.str.lower()

    # (log message, transform) pairs, applied strictly in this order.
    pipeline = (
        ("\t--removing user tags", clean_user_tags),
        ("\t--removing links", remove_links),
        ("\t--decontracting words", decontracted),
        ("\t--replacing emojizzz with description", replace_emoji_with_description),
        ("\t--cleaning punctuation", clean_punc),
        ("\t--keeping just text", keep_alpha),
        ("\t--removing stopwords", remove_stop_words),
        ("\t--merging multiple spaces", merge_multiple_character_occurrences),
        ("\t--stemming tweets", stem_tweets),
    )

    for message, transform in pipeline:
        log(message)
        df.tweet = df.tweet.apply(transform)

    return df
Example no. 4
0
import HateTweets.IO.InputManager as InputManager
import HateTweets.Classification as classifiers
import HateTweets.preprocess.PreprocessUtil as PreProcess
from HateTweets.IO.OutputUtil import log, logDF, save2csv, plot

# Entry-point script: load the dataset, preprocess it, plot the class
# distribution, then run the classification stage.
log("### HATE TWEETS CLASSIFICATION ###")

# Load the raw tweets dataset, then run the full preprocessing pipeline.
df = InputManager.load_data()
df = PreProcess.run(df)

# Visualize the label distribution of the 'class' column.
plot(df, 'class')
# save2csv(df, "preprocessed")

# Train/evaluate the classifiers on the cleaned data.
classifiers.classify(df)

log("##################################")
Example no. 5
0
def load_data() -> pd.DataFrame:
    """Load the tweets dataset from DATASET_DIR/DATASET_FILENAME.

    Returns:
        The dataset as a DataFrame, using the CSV's first column as index.
    """
    log(f"Reading {DATASET_FILENAME}")
    # index_col=0: the CSV's first column holds the row index.
    df = pd.read_csv(DATASET_DIR + DATASET_FILENAME, index_col=0)
    logDF(df.head())
    return df