Beispiel #1
0
    def label(self, start: int = 0, end: int = None):
        """
        Walk through the rows ``start:end`` of ``self.dataframe`` for manual labeling.

        For each row the title and text are printed with person names, party names
        and a fixed set of media names replaced by placeholders. The row is then
        handed to ``self.get_polarity_input`` and ``self.get_subjectivity_input``
        (presumably prompting the annotator - confirm against those methods), and
        the dataframe slice is written out as "labeled_paragraphs".

        :param start: Position of the first row to label.
        :param end: Position after the last row to label; defaults to the dataframe length.
        """
        if end is None:
            end = len(self.dataframe)

        dataframe = self.dataframe.iloc[start:end]

        # Regex alternation picks the first alternative that matches, so longer
        # names must come first; otherwise "bildplus" becomes "<Zeitung>plus".
        # The pattern is constant, so it is compiled once outside the loop.
        # NOTE(review): there are no word boundaries, so "bild" also matches inside
        # longer words - confirm whether that is intended.
        media_names = sorted(["bild", "bildplus", "taz", "tagesschau"], key=len, reverse=True)
        media_pattern = re.compile("|".join(media_names), re.IGNORECASE)

        for index, row in dataframe.iterrows():
            title = row["title"]
            text = row["original_text"]

            # Ignore very short names (two characters or fewer); longest first for
            # the same alternation-order reason as above.
            persons = sorted(
                (person for person in row["persons"] if len(person) > 2),
                key=len,
                reverse=True,
            )

            if persons:
                # Names are data, not patterns: escape regex metacharacters.
                person_pattern = re.compile(
                    "|".join(re.escape(person) for person in persons), re.IGNORECASE
                )
                title = person_pattern.sub("<Person>", title)
                text = person_pattern.sub("<Person>", text)

            title = self.partyPattern.sub("<Partei>", title)
            text = self.partyPattern.sub("<Partei>", text)

            title = media_pattern.sub("<Zeitung>", title)
            text = media_pattern.sub("<Zeitung>", text)

            print("================================================")
            print(title)
            print("++++++++++++++++++++++++++++++++++++++++++++++++")
            print(text)
            print("================================================")

            self.get_polarity_input(dataframe, index)
            self.get_subjectivity_input(dataframe, index)

            # Written after every row - presumably so progress survives an
            # interrupted labeling session.
            Writer.write_dataframe(dataframe, "labeled_paragraphs")
Beispiel #2
0
    def train_threshold(self) -> float:
        """
        Train the threshold with labeled data.

        Performs a grid search over score thresholds from 0 to 0.005 (inclusive) in
        steps of 1e-7. For every threshold the sentiment is re-mapped and the three
        values of the first result of ``self.f1_score(training=True)`` are summed;
        the threshold with the highest sum wins (the first one on ties). Afterwards
        the training is visualized, the sentiment is mapped with the best threshold
        and ``self.dataframe`` is written as "labeled_paragraphs".

        :return The best threshold.
        """
        max_threshold: float = 0.005
        # The grid is derived from an integer counter instead of repeatedly adding
        # 1e-7 to a float: accumulated rounding error would otherwise shift the
        # grid values and make it accidental whether max_threshold is evaluated.
        steps_per_unit: int = 10_000_000
        last_step: int = round(max_threshold * steps_per_unit)

        best_threshold: float = 0
        best_score: float = 0

        thresholds: List[float] = []
        f1_scores: List[Tuple[float, float, float, float]] = []

        self.tfidf_sentiment.get_context_polarity(8)
        self.tfidf_sentiment.calculate_sentiment_score(overwrite=True)

        # Iterate over different thresholds, increase with every loop
        for step in range(last_step + 1):
            threshold = step / steps_per_unit
            self.tfidf_sentiment.map_sentiment(threshold=threshold, overwrite=True)

            # Optimize over the sum of all f1 scores for SentiWs
            f1_sentiws, _, _ = self.f1_score(training=True)
            f1_sum = f1_sentiws[0] + f1_sentiws[1] + f1_sentiws[2]

            thresholds.append(threshold)
            f1_scores.append((f1_sum, f1_sentiws[0], f1_sentiws[1], f1_sentiws[2]))

            # Replace best threshold if current one is better
            if f1_sum > best_score:
                best_score = f1_sum
                best_threshold = threshold

        # Visualize the training
        self.visualize_threshold(thresholds, f1_scores, best_threshold, max_threshold)

        # Adjust the sentiment with best threshold
        self.tfidf_sentiment.map_sentiment(threshold=best_threshold, overwrite=True)
        Writer.write_dataframe(self.dataframe, "labeled_paragraphs")
        return best_threshold
Beispiel #3
0
            )
            sys.exit()

        comparison = Comparison(labeled_file)

        # Train the score threshold
        optimal_threshold = comparison.train_threshold()
        print("Optimal threshold: {}\n".format(optimal_threshold))

        # Train the window and the score threshold
        optimal_context_thresholds = comparison.train_context_thresholds()
        print("Optimal context thresholds: {} (window), {} (score)\n".format(
            optimal_context_thresholds[0], optimal_context_thresholds[1]))

    # Save paragraphs to disk
    Writer.write_dataframe(df_paragraphs, "paragraphs")

    # Show GUI
    if args.show_gui:
        gui = SentimentGUI(df_paragraphs)
        gui.show_gui()

    # Compare labeled data with results
    if args.compare:
        labeled_file = Path("src/output/labeled_paragraphs.json")

        if not labeled_file.exists():
            print(
                'You have to provide a labeled file "labeled_paragraphs.json" for comparison in the output folder'
            )
            sys.exit()
Beispiel #4
0
    def train_context_thresholds(self) -> Tuple[float, float]:
        """
        Train the context threshold (SentiWs with context polarity) with labeled data.

        Performs a nested grid search: window thresholds 0..35 (inclusive) in the
        outer loop, score thresholds from 0 up to (but excluding) 0.001 in steps of
        1e-5 in the inner loop. Each combination is scored by the sum of the three
        values of the third result of ``self.f1_score(training=True)``; the first
        combination reaching the highest sum wins. For the visualization only the
        best score per window is kept. Afterwards the sentiment is recomputed with
        the best thresholds and ``self.dataframe`` is written as "labeled_paragraphs".

        :return The best thresholds for window size and score.
        """
        max_window_threshold: int = 35
        # Score thresholds are derived from an integer counter instead of repeatedly
        # adding 1e-5 to a float: accumulated rounding error could otherwise let the
        # exclusive upper bound admit an extra iteration.
        score_steps_per_unit: int = 100_000
        score_step_count: int = 100  # covers 0, 1e-5, ..., 0.00099 (< 0.001)

        best_window_threshold: int = 0
        best_score_threshold: float = 0
        best_score: float = 0

        thresholds: List[float] = []
        f1_scores: List[Tuple] = []

        # Iterate over different window thresholds, increase with every loop
        for window_threshold in range(max_window_threshold + 1):
            self.tfidf_sentiment.get_context_polarity(window_threshold)
            self.tfidf_sentiment.calculate_sentiment_score(overwrite=True)

            # Save best temp scores for visualization
            # NOTE(review): stays an empty tuple if no f1 sum of this window exceeds 0 -
            # confirm that visualize_threshold copes with that.
            best_temp_score_f1_sum: float = 0
            best_temp_score_f1_scores: Tuple = ()

            # Iterate over different score thresholds, increase with every loop
            for score_step in range(score_step_count):
                score_threshold = score_step / score_steps_per_unit
                self.tfidf_sentiment.map_sentiment(overwrite=True, threshold=score_threshold)
                self.dataframe = self.tfidf_sentiment.df_paragraphs

                # Optimize over the sum of all f1 scores for context sentiment
                _, _, f1_context = self.f1_score(training=True)
                f1_sum = f1_context[0] + f1_context[1] + f1_context[2]

                # Replace best temp thresholds for visualization if current ones are better
                if f1_sum > best_temp_score_f1_sum:
                    best_temp_score_f1_sum = f1_sum
                    best_temp_score_f1_scores = (f1_sum, f1_context[0], f1_context[1], f1_context[2])

                # Replace best thresholds if current ones are better
                if f1_sum > best_score:
                    best_score = f1_sum
                    best_window_threshold = window_threshold
                    best_score_threshold = score_threshold

            thresholds.append(window_threshold)
            f1_scores.append(best_temp_score_f1_scores)

        # Visualize the training
        self.visualize_threshold(thresholds, f1_scores, best_window_threshold, max_window_threshold)

        # Adjust the sentiment with best thresholds
        self.tfidf_sentiment.get_context_polarity(best_window_threshold)
        self.tfidf_sentiment.calculate_sentiment_score(overwrite=True)
        self.tfidf_sentiment.map_sentiment(overwrite=True, threshold=best_score_threshold)
        # Re-read the paragraphs exactly as the search loop does after each mapping,
        # so the written file reflects the best thresholds and not the last grid point
        # (matters if df_paragraphs is rebound by the calls above - TODO confirm).
        self.dataframe = self.tfidf_sentiment.df_paragraphs
        Writer.write_dataframe(self.dataframe, "labeled_paragraphs")
        return best_window_threshold, best_score_threshold