Example #1
def run_feature_extraction_create_corpus(run_from_scratch, df_preprocessed):
    """ Run corpus building if run_from_scratch=True """
    if run_from_scratch:
        df_corpus = build_corpus(df_preprocessed)
        df_corpus.to_csv(
            str(get_project_root()) + "/data/extracted_features/corpus.csv")
        return df_corpus
    else:
        # index_col=0 restores the index written by to_csv above
        df_corpus = pd.read_csv(
            str(get_project_root()) + "/data/extracted_features/corpus.csv",
            index_col=0)
        return df_corpus
Example #2
def run_preprocessing(run_from_scratch):
    """ Run data preprocessing if run_from_scratch=True """
    if run_from_scratch:
        # prepare corpus
        print("\nPreparing data ...")
        prepare_and_merge_datasets()
    # both branches read the same (freshly written or cached) csv
    df_preprocessed = pd.read_csv(str(get_project_root()) +
                                  "/data/preprocessed/dataset.csv",
                                  index_col=0)
    return df_preprocessed
Example #3
def run_feature_extraction(run_from_scratch, df_corpus):
    """ Run feature extraction if run_from_scratch=True """
    if run_from_scratch:
        print("\nExtracting features ...")
        df_extracted_features = FeatureExtractor(
            df_corpus).get_df_with_all_features()
        df_extracted_features = df_extracted_features.drop(
            ["original_content", "content", "tokens", "pos", "stems"], axis=1)
        df_extracted_features.to_csv(
            str(get_project_root()) +
            "/data/extracted_features/extracted_features.csv")
        return df_extracted_features
    else:
        df_extracted_features = pd.read_csv(
            str(get_project_root()) +
            "/data/extracted_features/extracted_features.csv",
            index_col=0)
        return df_extracted_features
Example #4
 def train_fasttext(self, path_to_csv_dataset_file):
     """Train fasttext model based on dataset
     Parameters:
         - path_to_csv_dataset_file: relative path to the dataset file (expects a csv file)
     """
     model = fasttext.train_unsupervised(path_to_csv_dataset_file,
                                         model="skipgram")
     model.save_model(
         str(get_project_root()) + "/models/fasttext_model.bin")
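For reference, loading the saved model back (as Example #11 does) and querying it; a short usage sketch, assuming the training step above has already run:

import fasttext

# load the model written by train_fasttext above
model = fasttext.load_model(
    str(get_project_root()) + "/models/fasttext_model.bin")
print(model.get_word_vector("hate").shape)  # embedding of a single word
print(model.get_nearest_neighbors("hate"))  # most similar words by cosine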
Example #5
def _prepare_hate_speech_dataset():
    df_dataset = _create_df_and_drop_columns(
        str(get_project_root())
        + "/data/original/hate-speech-dataset/annotations_metadata.csv",
        None,
        ["user_id", "subforum_id", "num_contexts"],
    )
    df_dataset = _filter_and_format_hate_speech(df_dataset)
    return df_dataset
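The _create_df_and_drop_columns helper is not among the examples; a hypothetical sketch inferred from its two call sites (here and in Example #7), assuming the second argument is passed through as read_csv's index_col:

def _create_df_and_drop_columns(path_to_csv, index_col, columns_to_drop):
    # hypothetical: read the csv and drop columns the pipeline never uses
    df = pd.read_csv(path_to_csv, index_col=index_col)
    return df.drop(columns_to_drop, axis=1)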
Example #6
def prepare_and_merge_datasets(include_offensive_language=False):
    """ Prepares and merges the datasets """
    if include_offensive_language:
        df_dataset = _prepare_hate_speech_and_offensive_language(True)
    else:
        df_first_dataset = _prepare_hate_speech_and_offensive_language()
        df_second_dataset = _prepare_hate_speech_dataset()
        df_dataset = pd.concat([df_first_dataset, df_second_dataset],
                               ignore_index=True)

    # write the merged dataset once for preprocessing and once for analysis
    with open(
        str(get_project_root()) + "/data/preprocessed/dataset.csv",
        encoding="utf-8",
        mode="w",
    ) as dataset_csv:
        dataset_csv.write(df_dataset.to_csv())

    with open(
        str(get_project_root()) + "/analysis/dataset.csv",
        encoding="utf-8",
        mode="w",
    ) as dataset_copy:
        dataset_copy.write(df_dataset.to_csv())
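Since DataFrame.to_csv also accepts a path directly, the same two writes could skip the manual file handles entirely; a hedged, behavior-equivalent alternative:

# equivalent: let pandas open and close the files itself
for path in ("/data/preprocessed/dataset.csv", "/analysis/dataset.csv"):
    df_dataset.to_csv(str(get_project_root()) + path, encoding="utf-8")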
Example #7
def _prepare_hate_speech_and_offensive_language(include_offensive_language=False):
    df_dataset = _create_df_and_drop_columns(
        str(get_project_root())
        + "/data/original/hate-speech-and-offensive-language/labeled_data.csv",
        0,
        ["count", "hate_speech", "offensive_language", "neither"],
    )

    if include_offensive_language:
        df_dataset.rename(columns={"tweet": "content"}, inplace=True)
    else:
        df_dataset = _filter_and_format_hate_speech_and_offensive_language(df_dataset)

    # df_dataset = _data_preparation(df_dataset)
    return df_dataset
Example #8
    def visualize_special_characters(self, df):
        """Visualizes the number of special characters as bar plot
        Parameters:
            df: dataframe with the extracted features for special characters
        Return:
            stores barplots in analysis folder
        """
        df_hate_speech = df[df["class"] == 0]
        df_neutral_speech = df[df["class"] == 1]
        for character in self.list_of_special_characters:
            hate_bincount = self._calculate_bincount_of_special_character(
                df_hate_speech, character
            )
            neutral_bincount = self._calculate_bincount_of_special_character(
                df_neutral_speech, character
            )

            hate_bincount_summarized = self._summarize_bincount_data(hate_bincount)
            neutral_bincount_summarized = self._summarize_bincount_data(
                neutral_bincount
            )

            x = np.arange(11)
            plt.bar(x + 0.0, hate_bincount_summarized, color="r", width=0.2)
            plt.bar(x + 0.2, neutral_bincount_summarized, color="b", width=0.2)
            x_ticks = [str(x) for x in range(10)]
            x_ticks.append(">10")
            plt.xticks(x, x_ticks)
            # assumes list_of_special_characters maps each character
            # to a human-readable name
            plt.title(
                "Number of data instances with number of "
                + self.list_of_special_characters[character]
            )
            plt.xlabel("Number of " + character)
            plt.ylabel("Number of data instances")
            plt.legend(["hate speech", "neutral speech"])
            plt.savefig(
                str(get_project_root())
                + "/analysis/features/semantic/barchart_special_character_"
                + character
            )
            # clear the figure so the next character's bars start fresh
            plt.clf()
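The _summarize_bincount_data helper is not shown; a hypothetical sketch consistent with the eleven x-ticks above (counts 0 through 9 plus a ">10" bucket):

    def _summarize_bincount_data(self, bincount):
        # hypothetical: collapse a raw np.bincount result into 11 buckets,
        # with the last bucket collecting all counts of 10 or more
        summarized = np.zeros(11, dtype=int)
        for count, frequency in enumerate(bincount):
            summarized[min(count, 10)] += frequency
        return summarized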
Example #9
    def visualize_word_embeddings_with_tsne(self, model):
        """Creates an TSNE model and plots it
        Parameters:
            - model: fasttext model
        """
        labels = []
        tokens = []

        for word in model.words:
            tokens.append(model[word])
            labels.append(word)

        tsne_model = TSNE(perplexity=40,
                          n_components=2,
                          init="pca",
                          n_iter=2500,
                          random_state=23)
        new_values = tsne_model.fit_transform(tokens)

        x = []
        y = []
        for value in new_values:
            x.append(value[0])
            y.append(value[1])

        plt.figure(figsize=(16, 16))
        # iterate by index so x keeps referring to the full coordinate list
        for index in range(len(x)):
            plt.scatter(x[index], y[index])
            plt.annotate(
                labels[index],
                xy=(x[index], y[index]),
                xytext=(5, 2),
                textcoords="offset points",
                ha="right",
                va="bottom",
            )
        plt.savefig(
            str(get_project_root()) + "/analysis/fasttext_tsne_visualization")
        plt.show()
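Plotting the full vocabulary quickly becomes unreadable; a hedged variation that subsamples the words before fitting TSNE (same fasttext model object, sample_size chosen arbitrarily):

import random

# sample a manageable subset of the vocabulary before fitting TSNE
sample_size = 500
words = random.sample(model.words, min(sample_size, len(model.words)))
labels = list(words)
tokens = [model[word] for word in words]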
Example #10
def _filter_and_format_hate_speech(df):
    df.loc[df["label"] == "hate", "label"] = 0
    df.loc[df["label"] == "noHate", "label"] = 1
    df.rename(columns={"label": "class"}, inplace=True)
    content = []
    for i, row in df.iterrows():
        if row["class"] == "idk/skip" or row["class"] == "relation":
            df = df.drop(index=i)
            continue
        with open(
            str(get_project_root())
            + "/data/original/hate-speech-dataset/all_files/{}.txt".format(
                row["file_id"]
            ),
            encoding="utf-8",
            mode="r",
        ) as file:
            content.append(file.read())
    df["content"] = content

    df.drop(["file_id"], axis=1, inplace=True)
    return df
Example #11
 def extract_features(self, df, visualize=False):
     """Extract vector representation of the data instance based on word embeddings trained by fasttext
     Parameters:
          - df: dataframe with the column containing the tokens of each data instance
     Return:
         passed df with new feature column containing a vector (mean of the word embeddings of all tokens)
     """
     model = fasttext.load_model(
         str(get_project_root()) + "/models/fasttext_model.bin")
     df_fasttext_vector = pd.DataFrame()
     df_fasttext_vector["fasttext_word_embeddings_vector"] = df[
         "tokens"].apply(
             lambda cell: self.get_vector_of_data_instance(model, cell))
     df_fasttext_vector = pd.DataFrame(
         df_fasttext_vector["fasttext_word_embeddings_vector"].values.
         tolist())
      # fasttext's default unsupervised embedding dimension is 100
      titles = [
          "fasttext_word_embeddings_vector_" + str(i) for i in range(100)
      ]
     df_fasttext_vector.columns = titles
     df = pd.concat([df_fasttext_vector, df], axis=1)
     if visualize:
         self.visualize_word_embeddings_with_tsne(model)
     return df
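get_vector_of_data_instance is not shown; a hypothetical sketch matching the docstring above (mean of the word embeddings of all tokens, numpy assumed imported as np):

 def get_vector_of_data_instance(self, model, tokens):
     # hypothetical: average the fasttext embeddings of all tokens
     vectors = [model.get_word_vector(token) for token in tokens]
     return np.mean(vectors, axis=0)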
Example #12
 def __init__(self):
     self.train_fasttext(
         str(get_project_root()) + "/data/preprocessed/dataset.csv")
Example #13
    df_preprocessed_data = run_preprocessing(preprocessing)
    df_data_corpus = run_feature_extraction_create_corpus(
        corpus, df_preprocessed_data)
    df_data_extracted_features = run_feature_extraction(
        feature_extraction, df_data_corpus)

    # unchanged dataset
    raw_text_features = df_preprocessed_data["content"]
    raw_text_labels = df_preprocessed_data["class"]
    extracted_features = df_data_extracted_features.loc[
        :, df_data_extracted_features.columns != "class"]
    labels = df_data_extracted_features["class"]

    # do balancing, i.e. over- and undersampling
    input_data = InputData(raw_text_features, raw_text_labels,
                           extracted_features, labels)

    # feature importances
    print("\nFeature importances ...")
    feature_importance = FeatureImportance(extracted_features, labels,
                                           extracted_features.columns.values)
    feature_importance.get_importance_scores()

    # run classifiers
    print("\nRunning classifiers ...")
    classifier_executor = ClassifierExecutor(input_data.get_datasets())
    df_results = classifier_executor.get_results()
    df_results.to_csv(str(get_project_root()) + "/results/results.csv")
Example #14
 def test_get_project_root(self):
     project_root_path = get_project_root()
     path_components = str(project_root_path).split("/")
     self.assertEqual("src", path_components[len(path_components) - 1])
     self.assertEqual("HateSpeechDetection",
                      path_components[len(path_components) - 2])
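get_project_root itself is not among the examples; a minimal sketch consistent with this test, assuming the helper module lives directly inside HateSpeechDetection/src:

from pathlib import Path

def get_project_root() -> Path:
    # hypothetical: the directory containing this file is
    # .../HateSpeechDetection/src, which the test above asserts
    return Path(__file__).parent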