Code example #1
import spacy
from spacy_sentiws import spaCySentiWS


def analyze_text(text):
    nlp = spacy.load('de_core_news_sm')
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(text)

    sentences = []
    for sent in doc.sents:
        sentences.append(sent)

    sentiws = spaCySentiWS(sentiws_path='data/sentiws/')
    nlp.add_pipe(sentiws)

    results = []

    for sentence in sentences:
        doc = nlp(sentence.text)
        filtered_doc = []

        for word in doc:
            if not word.is_stop:
                filtered_doc.append(word)

        for token in filtered_doc:
            if token._.sentiws:
                if token._.sentiws < -0.3:
                    results.append({
                        "text": sentence.text,
                        "danger": [token.text],
                        "danger_value": token._.sentiws,
                        "danger_obj": token.pos_
                    })

    return results
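
A minimal usage sketch for the function above; the sample sentence is invented, and it assumes the SentiWS data is unpacked under data/sentiws/ as in the snippet:

# hypothetical call, not part of the original source
if __name__ == '__main__':
    hits = analyze_text('Der Service war eine Katastrophe, aber das Essen war gut.')
    for hit in hits:
        print(hit['text'], hit['danger'], hit['danger_value'], hit['danger_obj'])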
Code example #2
 def __init__(self,
              wordvecs=False,
              sentiws_path='data/sentiws/',
              polarity_modifiers_path='data/polarity_modifiers.pickle'):
     """
     Parameters:
     - wordvecs: True or False; use de_core_news_sm
                 or de_core_news_md german spacy model
     - sentiws_path: path of your sentiws data
     - polarity_modifiers_path: path of your polarity
       modifier dict as pickle
     """
     # load the German spaCy model
     if wordvecs:
         self.nlp = spacy.load('de_core_news_md')
     else:
         self.nlp = spacy.load('de_core_news_sm')
     # integrate the SentiWS corpus as a token attribute
     sentiws = spaCySentiWS(sentiws_path=sentiws_path)
     self.nlp.add_pipe(sentiws)
     self.doc = None
     self.modifiers = pickle.load(open(polarity_modifiers_path, 'rb'))
     if not Token.has_extension("modified"):
         Token.set_extension("modified", getter=self.modify_polarity)
     if not Token.has_extension("negated"):
         Token.set_extension("negated", getter=self.negate)
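
The snippet above is the __init__ of a larger class; assuming that class were importable as, say, SentimentAnalyzer (a made-up name), the registered token extensions could be read roughly like this:

# illustrative sketch only; the class name is an assumption
analyzer = SentimentAnalyzer(wordvecs=False)
doc = analyzer.nlp('Das Ergebnis ist nicht gut.')
for token in doc:
    # sentiws comes from spaCySentiWS; modified/negated use the getters registered above
    print(token.text, token._.sentiws, token._.modified, token._.negated)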
Code example #3
def content_analysis(directory,
                     party="all",
                     sample=None,
                     window_size=25,
                     debug=False):

    if directory != 'test':
        Path(f"res_ca/{directory}/").mkdir(parents=False, exist_ok=False)

    doc_labels = load_data(party)
    if type(sample) == int:
        doc_labels = random.sample(doc_labels, sample)
        text = None
    elif type(sample) == str:
        doc_labels = ['test']
        text = sample
    elif type(sample) == list:
        doc_labels = sample
        text = None
    else:
        text = None
    print("Number of documents: {}".format(len(doc_labels)))
    print(
        f"Beginning Content Analysis with parameters: \n party: {party} | samplesize: {sample} | windowsize: {window_size}"
    )
    nlp = spacy.load("de_core_news_lg")
    ca = ContentAnalysis(nlp, window_size=window_size)
    entity_recognizer = EntityRecognizer(nlp)
    sentiment_recognizer = SentimentRecognizer(nlp)
    sentiws = spaCySentiWS(sentiws_path='sentiws/')
    # clf = TextClassification(nlp)
    # nlp.add_pipe(custom_lemma, last=True)
    nlp.add_pipe(custom_extensions, last=True)
    nlp.add_pipe(sentiment_recognizer, last=True)
    nlp.add_pipe(sentiws, last=True)
    nlp.add_pipe(entity_recognizer, last=True)
    nlp.add_pipe(ca, last=True)
    # nlp.add_pipe(clf, last=True)
    nlp.remove_pipe("ner")
    labels = []
    for label in tqdm(doc_labels):
        labels.append(label)
        if text:
            doc = nlp(text)
            if debug:
                for token in doc:
                    print(token.text, token.ent_type_, token._.is_elite_neg,
                          token._.is_attr, token._.is_negated, 'lemma',
                          token._.lemma)
        else:
            doc = nlp(gendocs(label))
        ca.results.labels.append(label)
    with open(f'res_ca/{directory}/labels.pkl', 'wb') as f:
        pickle.dump(labels, f)
    with open(f'res_ca/{directory}/results_all.pkl', 'wb') as f:
        pickle.dump(ca.results, f)
    print(f"Content Analysis complete. \nResults saved in {directory}/...")

    return ca.results
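
A hedged invocation sketch for content_analysis; the directory name and sample size are placeholders, and the helpers it relies on (load_data, gendocs, ContentAnalysis, EntityRecognizer, SentimentRecognizer, custom_extensions) must already be importable:

# hypothetical call; results are also pickled under res_ca/<directory>/
results = content_analysis('example_run', party='all', sample=100, window_size=25)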
Code example #4
    def setUpClass(cls):
        cls.nlp = spacy.load("de_core_news_lg", disable=["parser"])
        cls.preprocessing = Preprocessing()
        cls.preprocessing.nlp = spacy.load("de_core_news_lg",
                                           disable=["parser"])
        cls.preprocessing.sentiws = spaCySentiWS(
            sentiws_path="src/data/sentiws/")
        cls.preprocessing.nlp.add_pipe(cls.preprocessing.sentiws)

        nltk.download("punkt")
Code example #5
 def __init__(self):
     self.logger = logging.getLogger()
     self.logger.setLevel(logging.DEBUG)
     self.logger.debug('Loading Spacy model')
     self.nlp = spacy.load('de')
     self.nlp.add_pipe(
         spaCyIWNLP(
             lemmatizer_path='data/IWNLP/IWNLP.Lemmatizer_20170501.json'))
     self.nlp.add_pipe(spaCySentiWS(sentiws_path='data/sentiws/'))
     self.logger.debug('Spacy loaded')
Code example #6
import numpy as np
import de_core_news_sm
from spacy_sentiws import spaCySentiWS


def get_sentiment_scores(data, emoji_dict):
    nlp = de_core_news_sm.load()
    # raw string so the backslash in this Windows-style path is not treated as an escape
    sentiws = spaCySentiWS(sentiws_path=r"data\sentiws")
    nlp.add_pipe(sentiws)
    scores = np.zeros((len(data), 1))
    for i in range(len(data)):
        doc = nlp(data[i])
        for j, token in enumerate(doc):
            if token._.sentiws:
                scores[i][0] += token._.sentiws
            elif str(token).startswith('U0') and len(str(token)) == 10:
                emoji = str(token)
                emoji = emoji.replace("U000", "0x")
                emoji = emoji.lower()
                if emoji in emoji_dict.keys():
                    scores[i][0] += emoji_dict[emoji]
    return scores
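
A minimal call sketch for get_sentiment_scores; the texts are invented and the emoji dictionary is left empty (the function looks emojis up by their lower-cased '0x...' codepoint strings):

# hypothetical usage, not from the original project
texts = ['Das war ein wunderbarer Tag.', 'Der Film war leider langweilig.']
print(get_sentiment_scores(texts, {}))  # one summed SentiWS score per text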
Code example #7
            data_dict['positive_rating'].append(posting.positive_rating)
            data_dict['posting_text'].append(text)
            data_dict['sentiment'].append(sentiment)
            data_dict['entity'].append(ent.text)

    return pd.DataFrame(data_dict)


if __name__ == '__main__':
    t1 = datetime.datetime.now()
    args = parser.parse_args()
    if args.verbose:
        logger.setLevel(10)
    session = get_db_session(args.verbose)
    nlp = spacy.load("de_core_news_lg")
    sentiws = spaCySentiWS(sentiws_path='sentiws/')
    nlp.add_pipe(sentiws)
    # nlp = spacy.load("de_core_news_md")
    # nlp = spacy.load("de_core_news_sm")

    entities = ['fpö']

    for article in session.query(Article):
        logger.info(f"Getting sentiments article: {article.article_url}")
        classified_postings = get_classify_postings(article.article_id)
        logger.debug(classified_postings.describe())
        # for entity in entities:
        # parenthesize the comparison: & binds tighter than !=, so without
        # parentheses the filter compares the AND result to 0.0
        entity_df = classified_postings.loc[
            classified_postings['entity'].isin(entities) &
            (classified_postings['sentiment'] != 0.0)
        ]
Code example #8
# imports assumed for this excerpt
import re

import spacy
from spacy.lang.de.stop_words import STOP_WORDS
from spacy_sentiws import spaCySentiWS
from tqdm import tqdm


class GetSentiment:
    """Class for computing sentiment scores."""

    nlp = spacy.load('de_core_news_sm')
    sentiws = spaCySentiWS(sentiws_path='App/SentiWS_v2.0')
    nlp.add_pipe(sentiws)

    def get_sentis(self, replik):
        u"""
        Ordnet den Repliken einen Sentimentwert zu.

        Geht jedes Wort der Replik durch, löscht Stopwörter, und ordnet
        Sentiment zu. Alle Sentimentwerte werden addiert und als value der
        Replik eingesetzt.

        :param replik: Dictionary mit Beziehungen und Repliken
        :return: Dictionary mit hinzugefügten Sentimentwerten
        """
        for key in tqdm(replik):
            for innerkey in replik[key]:
                text = re.sub(r"\.{0,3},*!*\?*-*", "", innerkey)
                text = self.nlp(text)
                textwostop = ""
                senti = 0
                tokenanzahl = 0
                # replik[key][innerkey] = senti.sentiment.polarity
                for token in text:
                    if str(token) not in STOP_WORDS:
                        textwostop = textwostop + " " + str(token)
                textwostop = self.nlp(textwostop)
                for tok in textwostop:
                    lemma = tok.lemma_
                    lemma = self.nlp(lemma)
                    for lem in lemma:
                        if lem._.sentiws is None:
                            tokenanzahl = tokenanzahl + 1
                        else:
                            senti = senti + lem._.sentiws
                            tokenanzahl = tokenanzahl + 1
                if len(list(text)) == 1:
                    tokenanzahl = tokenanzahl - 1
                # replik[key][innerkey] = senti / tokenanzahl
                replik[key][innerkey] = senti
        return replik

    def average_senti(self, replik):
        u"""
        Rechnet alle Sentimentwerte der Beziehungen zusammen.

        Erstellt neues Dictionary mit Beziehung als Key und gesamten
        Sentimentwert als Value.

        :param replik: Replik-Dictionary mit Sentimentwerten
        :return: Dictionary mit Beziehungen und deren Sentimentwerte
        """
        all_in_all = {}
        for key in replik:
            gesamtsentiment = 0
            anzahlreplik = 0
            for innerkey, value in replik[key].items():
                anzahlreplik = anzahlreplik + 1
                gesamtsentiment = (gesamtsentiment + value)
                # all_in_all[key] = gesamtsentiment / anzahlreplik
                all_in_all[key] = gesamtsentiment
        return all_in_all

    def gesprocheneworte(self, replik):
        u"""
        Rechnet zueinander gesprochene Worte zusammen.

        :param replik: Replik-Dictionary mit Sentimentwerten
        :return: Dictionary mit Anzahl der gesprochenen Worte
        """
        gesprocheneworte = {}
        for key in replik:
            anzahlworte = 0
            for innerkey in replik[key]:
                for wort in innerkey.split():
                    anzahlworte += 1
            if anzahlworte == 0:
                pass
            else:
                gesprocheneworte[key] = anzahlworte

        return gesprocheneworte
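
The class above expects a nested dictionary mapping each character relationship to its utterances; a made-up example of chaining its methods (the relationship key and the line of dialogue are invented, and the SentiWS path in the class body must resolve):

# illustrative data only; in the original project this comes from the play's dialogue
gs = GetSentiment()
replik = {'Faust->Gretchen': {'Ich liebe dich von ganzem Herzen.': 0}}
replik = gs.get_sentis(replik)       # per-utterance SentiWS sums
print(gs.average_senti(replik))      # total sentiment per relationship
print(gs.gesprocheneworte(replik))   # words spoken per relationship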
Code example #9
from textblob_de import TextBlobDE as TextBlob
from spacy_sentiws import spaCySentiWS
import spacy
import re
import nltk
import csv
from HanTa import HanoverTagger as ht
import hilfsprogramme.preprocessing as prep

nlp = spacy.load('de')
sentiws = spaCySentiWS(sentiws_path='/Users/pia/Desktop/Uni/SoSe2019/Drama Mining und Film Analyse/Projekt/figurennetzwerk/Senti Net 1.0/SentiWS_v2.0')
nlp.add_pipe(sentiws)


DDR_PATH = '/Users/pia/Desktop/Uni/Bachelor-Arbeit/DDR-BRD-comparison/Datenbeschaffung/Data/ddr_string.txt'
BRD_PATH = '/Users/pia/Desktop/Uni/Bachelor-Arbeit/DDR-BRD-comparison/Datenbeschaffung/Data/brd_string.txt'
paths = [DDR_PATH, BRD_PATH]    

def sentiment_analyse(hits):
    words = prep.get_words(hits)
    lemmalist = prep.get_lemmalist(words, "all")

    #wort = "Nacht"

    senti_dict = {}

    for wort in gleichhäufige_worte:
        indices = []
        """for i in range(len(lemmalist)):

        if lemmalist[i] == wort:
Code example #10
    def full_summary(self):
        """performs sentiment analysis on full summary"""

        def _preprocess_text_with_spacy(summary, model):
            """returns text with lemmatized tokens"""
            one_summary = []
            doc = model(summary)
            for word in doc:
                if not word.is_stop and word.is_alpha:
                    if str(word) in ["Winkels", "Spinnen"]:
                        word = str(word)
                    else:
                        word = word.lemma_
                    one_summary.append(word)
            return ' '.join(one_summary)

        def _sentiment_analysis(summaries, model):
            """performs sentiment analysis with sentiWS, returns new DataFrame"""
            sentiment_scores = []
            summary_sentiment = pd.DataFrame()
            for i, summary in enumerate(summaries):
                doc = model(summary)
                sentiment_score = 0
                total_tokens = 0
                pos_tokens = 0
                neg_tokens = 0
                not_covered_tokens = 0
                positive_rating = 0
                negative_rating = 0
                pos_tokens_dict = {}
                neg_tokens_dict = {}
                for token in doc:
                    if not token.is_stop and token.is_alpha:
                        #                 token = token.lemma_
                        total_tokens += 1
                        if type(token._.sentiws) is float:
                            sentiment_score += token._.sentiws
                            if token._.sentiws > 0:
                                if str(token) not in pos_tokens_dict.keys():
                                    pos_tokens_dict[str(token)] = 1
                                else:
                                    pos_tokens_dict[str(token)] += 1
                                pos_tokens += 1
                                positive_rating += token._.sentiws
                            elif token._.sentiws < 0:
                                if str(token) not in neg_tokens_dict.keys():
                                    neg_tokens_dict[str(token)] = 1
                                else:
                                    neg_tokens_dict[str(token)] += 1
                                neg_tokens += 1
                                negative_rating += token._.sentiws
                        else:
                            not_covered_tokens += 1
                # guard against division by zero when a summary has no scored tokens
                if pos_tokens > 0 or neg_tokens > 0:
                    sentiment_score = sentiment_score / (pos_tokens + neg_tokens)
                if pos_tokens > 0:
                    positive_rating = positive_rating / pos_tokens
                if neg_tokens > 0:
                    negative_rating = negative_rating / neg_tokens

                print(i + 524, "SENTIMENT ANALYSIS: ")
                print("TOTAL SENTIMENT SCORE: ", sentiment_score)
                print("TOTAL TOKENS: ", total_tokens)
                print("TOKENS WITHOUT SENTIMENT: ", not_covered_tokens)
                print("POSITIVE TOKENS: ", pos_tokens)
                print("POSITIVE RATING: ", positive_rating)
                print("NEGATIVE TOKENS: ", neg_tokens)
                print("NEGATIVE RATING: ", negative_rating)
                sentiment_scores.append(sentiment_score)
                sentiment_distance = positive_rating - negative_rating
                print("SENTIMENT DISTANCE: ", sentiment_distance)
                pos_tokens_dict = {k: v for k, v in sorted(pos_tokens_dict.items(), key=lambda item: item[1], reverse=True)}
                print("POSITIVE TOKENS DICTIONARY: ", pos_tokens_dict)
                neg_tokens_dict = {k: v for k, v in sorted(neg_tokens_dict.items(), key=lambda item: item[1], reverse=True)}
                print("NEGATIVE TOKENS DICTIONARY: ", neg_tokens_dict, "\n")
                summary_sentiment = pd.concat([summary_sentiment, pd.DataFrame(
                    [[i + 524], [total_tokens], [sentiment_score], [not_covered_tokens], [pos_tokens],
                     [positive_rating], [neg_tokens], [negative_rating], [sentiment_distance], [str(summary)],
                     [pos_tokens_dict], [neg_tokens_dict]]).transpose()])
            return summary_sentiment


        df = pd.read_csv(self.source, index_col='text_id')
        print("Source loaded ..")
        nlp = spacy.load('de_core_news_md')
        print("Spacy model loaded ..")
        print("Starting preprocessing ..")
        df['processed_summary'] = df['discussion_summary'].apply(lambda row: _preprocess_text_with_spacy(row, nlp))
        print("Summaries preprocessed ..")
        summaries = df[~df['discussion_summary'].isna()]['processed_summary'].values
        sentiws = spaCySentiWS(sentiws_path=self.sentiws_path)
        nlp.add_pipe(sentiws)
        summary_sentiment = _sentiment_analysis(summaries, nlp)
        print("Sentiment analysis completed ..")
        summary_sentiment = summary_sentiment.rename(
            columns={0: "text_id", 1: "total_tokens", 2: "sentiment_score", 3: "tokens_without_sentiment",
                     4: "positive_tokens", 5: "positive_sentiment", 6: "negative_tokens", 7: "negative_sentiment",
                     8: "sentiment_distance", 9: "element_text", 10: "pos_tokens",
                     11: "neg_tokens"}).reset_index().drop(columns='index').set_index('text_id').copy()
        summary_sentiment.to_csv(self.output_path)
        print("DataFrame saved ..")
Code example #11
    def detailed_summary(self):
        """performs sentiment analysis on summary details"""

        def _preprocess_text_with_spacy(summary, model):
            """returns text with lemmatized tokens"""
            one_summary = []
            doc = model(summary)
            for word in doc:
                if not word.is_stop and word.is_alpha:
                    if str(word) in ["Winkels", "Spinnen"]:
                        word = str(word)
                    else:
                        word = word.lemma_
                    one_summary.append(word)
            return ' '.join(one_summary)

        def _detailed_sentiment_analysis(df2, model):
            """performs detailed sentiment analysis with sentiWS, returns new DataFrame"""
            detailed_scores = pd.DataFrame()
            for author_id in range(524, (len(df2) + 524)):
                print(author_id)
                critics = critics_all[
                    critics_all.year == int(df2.iloc[author_id - 524].date_participation[:4])].full_name.values
                critics_last_names = [critic.split()[-1] for critic in critics]
                if author_id >= 717 and author_id <= 827:
                    split_summary = df2.iloc[author_id - 524]['discussion_summary'].split("\n")
                else:
                    split_summary = df2.iloc[author_id - 524]['discussion_summary'].split("\n\n")
                cleaned_split_summary = []
                for split in split_summary:
                    if split == "":
                        pass
                    else:
                        split = _preprocess_text_with_spacy(split, model)
                        cleaned_split_summary.append(split)
                for i, summary in enumerate(cleaned_split_summary):
                    talking_ids = []
                    talking = []
                    doc = model(summary)
                    sentiment_score = 0
                    total_tokens = 0
                    pos_tokens = 0
                    neg_tokens = 0
                    not_covered_tokens = 0
                    positive_rating = 0
                    negative_rating = 0
                    pos_tokens_dict = {}
                    neg_tokens_dict = {}
                    for token in doc:
                        if not token.is_stop and token.is_alpha:
                            #                 token = token.lemma_
                            total_tokens += 1
                            if type(token._.sentiws) is float:
                                sentiment_score += token._.sentiws
                                if token._.sentiws > 0:
                                    if str(token) not in pos_tokens_dict.keys():
                                        pos_tokens_dict[str(token)] = 1
                                    else:
                                        pos_tokens_dict[str(token)] += 1
                                    pos_tokens += 1
                                    positive_rating += token._.sentiws
                                elif token._.sentiws < 0:
                                    if str(token) not in neg_tokens_dict.keys():
                                        neg_tokens_dict[str(token)] = 1
                                    else:
                                        neg_tokens_dict[str(token)] += 1
                                    neg_tokens += 1
                                    negative_rating += token._.sentiws
                            else:
                                not_covered_tokens += 1
                    if pos_tokens > 0 or neg_tokens > 0:
                        sentiment_score = sentiment_score / (pos_tokens + neg_tokens)
                        if pos_tokens > 0:
                            positive_rating = positive_rating / pos_tokens
                        if neg_tokens > 0:
                            negative_rating = negative_rating / neg_tokens
                    sentiment_distance = positive_rating - negative_rating
                    for j, critic in enumerate(critics):
                        if (critic in split_summary[i]) or (critics_last_names[j] in split_summary[i]):
                            talking.append(critic)
                            talking_ids.append(str(critics_all[critics_all.full_name == critic]['critic_id'].values[0]))
                    print(author_id, i, ", ".join(talking), sentiment_score)
                    pos_tokens_dict = {k: v for k, v in sorted(pos_tokens_dict.items(), key=lambda item: item[1], reverse=True)}
                    neg_tokens_dict = {k: v for k, v in sorted(neg_tokens_dict.items(), key=lambda item: item[1], reverse=True)}
                    detailed_scores = pd.concat([detailed_scores, pd.DataFrame(
                        [[author_id], [i], ["|".join(talking_ids)], ["|".join(talking)], [total_tokens], [sentiment_score],
                         [not_covered_tokens], [pos_tokens], [positive_rating], [neg_tokens], [negative_rating],
                         [sentiment_distance], [split_summary[i]], [pos_tokens_dict], [neg_tokens_dict]]).transpose()])
            return detailed_scores

        df = pd.read_csv(self.source, index_col='text_id')
        nlp = spacy.load('de_core_news_md')
        sentiws = spaCySentiWS(sentiws_path=self.sentiws_path)
        nlp.add_pipe(sentiws)
        critics_participations = pd.read_csv('../data/database_csvs/critics_participations_all.csv')
        critics = pd.read_csv('../data/database_csvs/critics_updated.csv')
        critics_all = pd.merge(critics_participations, critics, left_on='critic_id', right_on='critic_id')
        critics_all["full_name"] = critics_all["first_name"].str.cat(
            critics_all[["middle_name", "last_name"]].fillna("").astype(str), sep=" ").str.replace("  ", " ")
        texts = pd.read_csv('../data/database_csvs/texts_updated.csv', index_col='text_id')
        df2 = pd.merge(df, texts[['date_participation']], left_on='text_id', right_on='text_id')
        detailed_summary_sentiments = _detailed_sentiment_analysis(df2, nlp)
        detailed_summary_sentiments = detailed_summary_sentiments.rename(
            columns={0: "text_id", 1: "element_position", 2: "critic_ids_talking", 3: "critics_talking",
                     4: "total_tokens", 5: "sentiment_score", 6: "tokens_without_sentiment", 7: "positive_tokens",
                     8: "positive_sentiment", 9: "negative_tokens", 10: "negative_sentiment", 11: "sentiment_distance",
                     12: "element_text", 13: "pos_tokens_dict", 14: "neg_tokens_dict"}).reset_index().drop(
            columns='index').copy()
        detailed_summary_sentiments.to_csv('detailed_sentiment_updated.csv')
Code example #12
    def _apply_preprocessing(
        self, dataframe: DataFrame, document_type: DocumentType, filter_type: FilterType
    ) -> DataFrame:
        """
        Helper function responsible for applying preprocessing steps in correct order.
        :param dataframe: data that needs to be preprocessed.
        :param document_type: Type of the document that is going to be preprocessed.
        :param filter_type: Specifies if documents with no parties or multiple parties should be removed.
        :return: Preprocessed dataframe.
        """
        self.nlp = spacy.load("de_core_news_lg", disable=["parser"])
        self.sentiws = spaCySentiWS(sentiws_path="src/data/sentiws/")
        self.nlp.add_pipe(self.sentiws)

        nltk.download("punkt")

        print("Start of preprocessing")
        start_time = time.time()

        # Copy original dataframe
        df_preprocessed = dataframe.copy()

        # Convert string date into datetime
        if "date" in df_preprocessed:
            df_preprocessed["date"] = df_preprocessed["date"].apply(lambda date: date.split("T")[0])
            df_preprocessed["date"] = df_preprocessed["date"].replace(r"^\s*$", np.nan, regex=True)
            df_preprocessed["date"].astype("datetime64[ns]")

        df_preprocessed["original_text"] = df_preprocessed["text"]

        # Remove rows with quotations if document is a paragraph
        if document_type.value == DocumentType.PARAGRAPH.value:
            df_preprocessed = self._remove_quotations_rows(df_preprocessed)

        # Remove special characters
        df_preprocessed["text"] = self._remove_special_characters(df_preprocessed["text"])

        # Tokenization
        df_preprocessed["text"] = self._tokenization(df_preprocessed["text"])

        # Get persons
        df_preprocessed["persons"] = self._tag_persons(df_preprocessed["text"])

        # Get organizations
        df_preprocessed["organizations"] = self._tag_organizations(df_preprocessed["text"])

        # Get parties
        df_preprocessed["parties"] = self._get_parties(df_preprocessed["organizations"])

        # Remove rows with no parties
        if filter_type.value == FilterType.PARTIES.value:
            df_preprocessed = self._remove_rows_without_parties(df_preprocessed)

        # Remove rows with no parties or more than one party
        if filter_type.value == FilterType.SINGLE_PARTY.value:
            df_preprocessed = self._keep_rows_with_one_party(df_preprocessed)

        # Sentiment polarity sentiws
        df_preprocessed["polarity"] = self._determine_polarity_sentiws(df_preprocessed["text"])

        # Sentiment polarity TextBlob
        df_preprocessed["polarity_textblob"] = self._determine_polarity_textblob(df_preprocessed["original_text"])

        # POS tagging
        df_preprocessed["pos_tags"] = self._pos_tagging(df_preprocessed["text"])

        # Get nouns
        df_preprocessed["nouns"] = self._get_nouns(df_preprocessed["text"])

        # Lemmatization
        df_preprocessed["text"] = self._lemmatizing(df_preprocessed["text"])

        # Negation handling
        df_preprocessed = self._negation_handling(df_preprocessed)

        end_time = time.time()
        print("End of preprocessing after {} seconds".format(end_time - start_time))

        if filter_type.value != FilterType.NONE.value:
            print("Number of documents after filtering: {}".format(len(df_preprocessed)))

        return df_preprocessed
Code example #13
 @classmethod
 def setUpClass(cls):
     cls.nlp = spacy.load('de')
     sentiws = spaCySentiWS(sentiws_path='data/sentiws/')
     cls.nlp.add_pipe(sentiws)
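
A test method built on this fixture might look like the sketch below; the method name, the sample sentence, and the assertion are assumptions rather than part of the original suite:

 # hypothetical test method, for illustration only
 def test_sentiws_attaches_scores(self):
     doc = self.nlp('Die Dienstleistung war hervorragend.')
     scores = [t._.sentiws for t in doc if t._.sentiws is not None]
     self.assertTrue(len(scores) > 0)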