def analyze_text(text):
    # load the small German model and add a sentencizer to split the text into sentences
    nlp = spacy.load('de_core_news_sm')
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(text)
    sentences = []
    for sent in doc.sents:
        sentences.append(sent)
    # add the SentiWS component so every token gets a ._.sentiws polarity score
    sentiws = spaCySentiWS(sentiws_path='data/sentiws/')
    nlp.add_pipe(sentiws)
    results = []
    for sentence in sentences:
        doc = nlp(sentence.text)
        filtered_doc = []
        for word in doc:
            if not word.is_stop:
                filtered_doc.append(word)
        # collect strongly negative tokens (SentiWS score below -0.3)
        for token in filtered_doc:
            if token._.sentiws:
                if token._.sentiws < -0.3:
                    results.append({
                        "text": sentence.text,
                        "danger": [token.text],
                        "danger_value": token._.sentiws,
                        "danger_obj": token.pos_
                    })
    return results
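# Hedged usage sketch (not part of the original snippet): assuming SentiWS data under
# data/sentiws/ and an installed de_core_news_sm model, analyze_text() returns one dict
# per strongly negative token. The sample sentence below is illustrative only.
hits = analyze_text("Der Sturm verursachte schwere Schäden und große Angst in der Stadt.")
for hit in hits:
    # sentence text, flagged token, its SentiWS score, and its part-of-speech tag
    print(hit["text"], hit["danger"], hit["danger_value"], hit["danger_obj"])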
def __init__(self, wordvecs=False, sentiws_path='data/sentiws/',
             polarity_modifiers_path='data/polarity_modifiers.pickle'):
    """
    Parameters:
    - wordvecs: True or False; use de_core_news_sm or de_core_news_md german spacy model
    - sentiws_path: path of your sentiws data
    - polarity_modifiers_path: path of your polarity modifier dict as pickle
    """
    # loading german spacy model
    if wordvecs:
        self.nlp = spacy.load('de_core_news_md')
    else:
        self.nlp = spacy.load('de_core_news_sm')
    # integrating SentiWS-Corpus as token attribute
    sentiws = spaCySentiWS(sentiws_path=sentiws_path)
    self.nlp.add_pipe(sentiws)
    self.doc = None
    self.modifiers = pickle.load(open(polarity_modifiers_path, 'rb'))
    if not Token.has_extension("modified"):
        Token.set_extension("modified", getter=self.modify_polarity)
    if not Token.has_extension("negated"):
        Token.set_extension("negated", getter=self.negate)
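# Hedged usage sketch (illustrative, not from the original project): the class name
# SentiAnalyzer is an assumption for whatever class owns this __init__; it only shows
# how the SentiWS score and the custom "modified"/"negated" getters could be read.
analyzer = SentiAnalyzer(wordvecs=False)
doc = analyzer.nlp("Das Essen war nicht gut.")
for token in doc:
    print(token.text, token._.sentiws, token._.modified, token._.negated)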
def content_analysis(directory, party="all", sample=None, window_size=25, debug=False):
    if directory != 'test':
        Path(f"res_ca/{directory}/").mkdir(parents=False, exist_ok=False)
    doc_labels = load_data(party)
    if type(sample) == int:
        doc_labels = random.sample(doc_labels, sample)
        text = None
    elif type(sample) == str:
        doc_labels = ['test']
        text = sample
    elif type(sample) == list:
        doc_labels = sample
        text = None
    else:
        text = None
    print("Number of documents: {}".format(len(doc_labels)))
    print(
        f"Beginning Content Analysis with parameters: \n party: {party} | samplesize: {sample} | windowsize: {window_size}"
    )
    nlp = spacy.load("de_core_news_lg")
    ca = ContentAnalysis(nlp, window_size=window_size)
    entity_recognizer = EntityRecognizer(nlp)
    sentiment_recognizer = SentimentRecognizer(nlp)
    sentiws = spaCySentiWS(sentiws_path='sentiws/')
    # clf = TextClassification(nlp)
    # nlp.add_pipe(custom_lemma, last=True)
    nlp.add_pipe(custom_extensions, last=True)
    nlp.add_pipe(sentiment_recognizer, last=True)
    nlp.add_pipe(sentiws, last=True)
    nlp.add_pipe(entity_recognizer, last=True)
    nlp.add_pipe(ca, last=True)
    # nlp.add_pipe(clf, last=True)
    nlp.remove_pipe("ner")
    labels = []
    for label in tqdm(doc_labels):
        labels.append(label)
        if text:
            doc = nlp(text)
            if debug:
                for token in doc:
                    print(token.text, token.ent_type_, token._.is_elite_neg, token._.is_attr,
                          token._.is_negated, 'lemma', token._.lemma)
        else:
            doc = nlp(gendocs(label))
        ca.results.labels.append(label)
    with open(f'res_ca/{directory}/labels.pkl', 'wb') as f:
        pickle.dump(labels, f)
    with open(f'res_ca/{directory}/results_all.pkl', 'wb') as f:
        pickle.dump(ca.results, f)
    print(f"Content Analysis complete. \nResults saved in {directory}/...")
    return ca.results
def setUpClass(cls):
    cls.nlp = spacy.load("de_core_news_lg", disable=["parser"])
    cls.preprocessing = Preprocessing()
    cls.preprocessing.nlp = spacy.load("de_core_news_lg", disable=["parser"])
    cls.preprocessing.sentiws = spaCySentiWS(sentiws_path="src/data/sentiws/")
    cls.preprocessing.nlp.add_pipe(cls.preprocessing.sentiws)
    nltk.download("punkt")
def __init__(self):
    self.logger = logging.getLogger()
    self.logger.setLevel(logging.DEBUG)
    self.logger.debug('Loading Spacy model')
    self.nlp = spacy.load('de')
    self.nlp.add_pipe(
        spaCyIWNLP(
            lemmatizer_path='data/IWNLP/IWNLP.Lemmatizer_20170501.json'))
    self.nlp.add_pipe(spaCySentiWS(sentiws_path='data/sentiws/'))
    self.logger.debug('Spacy loaded')
def get_sentiment_scores(data, emoji_dict):
    nlp = de_core_news_sm.load()
    # forward slashes avoid the invalid "\s" escape in the original Windows-style path
    sentiws = spaCySentiWS(sentiws_path="data/sentiws")
    nlp.add_pipe(sentiws)
    scores = np.zeros((len(data), 1))
    for i in range(len(data)):
        doc = nlp(data[i])
        for j, token in enumerate(doc):
            if token._.sentiws:
                scores[i][0] += token._.sentiws
            elif str(token).startswith('U0') and len(str(token)) == 10:
                # map escaped emoji codepoints onto the emoji sentiment lexicon
                emoji = str(token)
                emoji = emoji.replace("U000", "0x")
                emoji = emoji.lower()
                if emoji in emoji_dict.keys():
                    scores[i][0] += emoji_dict[emoji]
    return scores
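# Hedged usage sketch: scoring two plain German strings with an empty emoji lexicon,
# so only SentiWS-covered tokens contribute; the texts are illustrative only.
beispiel_texte = ["Ich liebe dieses Produkt.", "Der Service war leider sehr schlecht."]
print(get_sentiment_scores(beispiel_texte, emoji_dict={}))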
        data_dict['positive_rating'].append(posting.positive_rating)
        data_dict['posting_text'].append(text)
        data_dict['sentiment'].append(sentiment)
        data_dict['entity'].append(ent.text)
    return pd.DataFrame(data_dict)


if __name__ == '__main__':
    t1 = datetime.datetime.now()
    args = parser.parse_args()
    if args.verbose:
        logger.setLevel(10)
    session = get_db_session(args.verbose)
    nlp = spacy.load("de_core_news_lg")
    sentiws = spaCySentiWS(sentiws_path='sentiws/')
    nlp.add_pipe(sentiws)
    # nlp = spacy.load("de_core_news_md")
    # nlp = spacy.load("de_core_news_sm")
    entities = ['fpö']
    for article in session.query(Article):
        logger.info(f"Getting sentiments article: {article.article_url}")
        classified_postings = get_classify_postings(article.article_id)
        logger.debug(classified_postings.describe())
        # for entity in entities:
        entity_df = classified_postings.loc[
            classified_postings['entity'].isin(entities)
            # parentheses are needed here: & binds tighter than !=
            & (classified_postings['sentiment'] != 0.0)
        ]
class GetSentiment():
    u"""Class for sentiment computation."""

    nlp = spacy.load('de_core_news_sm')
    sentiws = spaCySentiWS(sentiws_path='App/SentiWS_v2.0')
    nlp.add_pipe(sentiws)

    def get_sentis(self, replik):
        u"""
        Assign a sentiment value to each speech.

        Goes through every word of the speech, removes stop words, and assigns a
        sentiment. All sentiment values are summed up and stored as the value of
        the speech.

        :param replik: dictionary of relationships and speeches
        :return: dictionary with added sentiment values
        """
        for key in tqdm(replik):
            for innerkey in replik[key]:
                text = re.sub(r"\.{0,3},*!*\?*-*", "", innerkey)
                text = self.nlp(text)
                textwostop = ""
                senti = 0
                tokenanzahl = 0
                # replik[key][innerkey] = senti.sentiment.polarity
                for token in text:
                    if str(token) not in STOP_WORDS:
                        textwostop = textwostop + " " + str(token)
                textwostop = self.nlp(textwostop)
                for tok in textwostop:
                    lemma = tok.lemma_
                    lemma = self.nlp(lemma)
                    for lem in lemma:
                        if lem._.sentiws is None:
                            tokenanzahl = tokenanzahl + 1
                        else:
                            senti = senti + lem._.sentiws
                            tokenanzahl = tokenanzahl + 1
                if len(list(text)) == 1:
                    tokenanzahl = tokenanzahl - 1
                # replik[key][innerkey] = senti / tokenanzahl
                replik[key][innerkey] = senti
        return replik

    def average_senti(self, replik):
        u"""
        Sum up all sentiment values per relationship.

        Creates a new dictionary with the relationship as key and the total
        sentiment value as value.

        :param replik: speech dictionary with sentiment values
        :return: dictionary of relationships and their sentiment values
        """
        all_in_all = {}
        for key in replik:
            gesamtsentiment = 0
            anzahlreplik = 0
            for innerkey, value in replik[key].items():
                anzahlreplik = anzahlreplik + 1
                gesamtsentiment = (gesamtsentiment + value)
            # all_in_all[key] = gesamtsentiment / anzahlreplik
            all_in_all[key] = gesamtsentiment
        return all_in_all

    def gesprocheneworte(self, replik):
        u"""
        Count the words spoken between characters.

        :param replik: speech dictionary with sentiment values
        :return: dictionary with the number of spoken words
        """
        gesprocheneworte = {}
        for key in replik:
            anzahlworte = 0
            for innerkey in replik[key]:
                for wort in innerkey.split():
                    anzahlworte += 1
            if anzahlworte == 0:
                pass
            else:
                gesprocheneworte[key] = anzahlworte
        return gesprocheneworte
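# Hedged usage sketch (illustrative data, not from the original project): replik maps a
# relationship to its speeches; get_sentis() overwrites each speech's value with the
# summed SentiWS score, which the other two methods then aggregate.
gs = GetSentiment()
replik = {"Faust->Gretchen": {"Du bist so schön und so gut": 0}}
mit_sentiment = gs.get_sentis(replik)
print(mit_sentiment)
print(gs.average_senti(mit_sentiment))
print(gs.gesprocheneworte(mit_sentiment))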
from textblob_de import TextBlobDE as TextBlob
from spacy_sentiws import spaCySentiWS
import spacy
import re
import nltk
import csv
from HanTa import HanoverTagger as ht
import hilfsprogramme.preprocessing as prep

nlp = spacy.load('de')
sentiws = spaCySentiWS(sentiws_path='/Users/pia/Desktop/Uni/SoSe2019/Drama Mining und Film Analyse/Projekt/figurennetzwerk/Senti Net 1.0/SentiWS_v2.0')
nlp.add_pipe(sentiws)

DDR_PATH = '/Users/pia/Desktop/Uni/Bachelor-Arbeit/DDR-BRD-comparison/Datenbeschaffung/Data/ddr_string.txt'
BRD_PATH = '/Users/pia/Desktop/Uni/Bachelor-Arbeit/DDR-BRD-comparison/Datenbeschaffung/Data/brd_string.txt'
paths = [DDR_PATH, BRD_PATH]


def sentiment_analyse(hits):
    words = prep.get_words(hits)
    lemmalist = prep.get_lemmalist(words, "all")
    # wort = "Nacht"
    senti_dict = {}
    for wort in gleichhäufige_worte:
        indices = []
        """for i in range(len(lemmalist)):
            if lemmalist[i] == wort:
def full_summary(self):
    """performs sentiment analysis on full summary"""

    def _preprocess_text_with_spacy(summary, model):
        """returns text with lemmatized tokens"""
        one_summary = []
        doc = model(summary)
        for word in doc:
            if not word.is_stop and word.is_alpha:
                if str(word) in ["Winkels", "Spinnen"]:
                    word = str(word)
                else:
                    word = word.lemma_
                one_summary.append(word)
        return ' '.join(one_summary)

    def _sentiment_analysis(summaries, model):
        """performs sentiment analysis with sentiWS, returns new DataFrame"""
        sentiment_scores = []
        summary_sentiment = pd.DataFrame()
        for i, summary in enumerate(summaries):
            doc = model(summary)
            sentiment_score = 0
            total_tokens = 0
            pos_tokens = 0
            neg_tokens = 0
            not_covered_tokens = 0
            positive_rating = 0
            negative_rating = 0
            pos_tokens_dict = {}
            neg_tokens_dict = {}
            for token in doc:
                if not token.is_stop and token.is_alpha:
                    # token = token.lemma_
                    total_tokens += 1
                    if type(token._.sentiws) is float:
                        sentiment_score += token._.sentiws
                        if token._.sentiws > 0:
                            if str(token) not in pos_tokens_dict.keys():
                                pos_tokens_dict[str(token)] = 1
                            else:
                                pos_tokens_dict[str(token)] += 1
                            pos_tokens += 1
                            positive_rating += token._.sentiws
                        elif token._.sentiws < 0:
                            if str(token) not in neg_tokens_dict.keys():
                                neg_tokens_dict[str(token)] = 1
                            else:
                                neg_tokens_dict[str(token)] += 1
                            neg_tokens += 1
                            negative_rating += token._.sentiws
                    else:
                        not_covered_tokens += 1
            # guard against division by zero for summaries without positive/negative
            # tokens (mirrors the checks used in the detailed analysis)
            if pos_tokens > 0 or neg_tokens > 0:
                sentiment_score = sentiment_score / (pos_tokens + neg_tokens)
            if pos_tokens > 0:
                positive_rating = positive_rating / pos_tokens
            if neg_tokens > 0:
                negative_rating = negative_rating / neg_tokens
            print(i + 524, "SENTIMENT ANALYSIS: ")
            print("TOTAL SENTIMENT SCORE: ", sentiment_score)
            print("TOTAL TOKENS: ", total_tokens)
            print("TOKENS WITHOUT SENTIMENT: ", not_covered_tokens)
            print("POSITIVE TOKENS: ", pos_tokens)
            print("POSITIVE RATING: ", positive_rating)
            print("NEGATIVE TOKENS: ", neg_tokens)
            print("NEGATIVE RATING: ", negative_rating)
            sentiment_scores.append(sentiment_score)
            sentiment_distance = positive_rating - negative_rating
            print("SENTIMENT DISTANCE: ", sentiment_distance)
            pos_tokens_dict = {k: v for k, v in sorted(pos_tokens_dict.items(), key=lambda item: item[1], reverse=True)}
            print("POSITIVE TOKENS DICTIONARY: ", pos_tokens_dict)
            neg_tokens_dict = {k: v for k, v in sorted(neg_tokens_dict.items(), key=lambda item: item[1], reverse=True)}
            print("NEGATIVE TOKENS DICTIONARY: ", neg_tokens_dict, "\n")
            summary_sentiment = pd.concat([summary_sentiment, pd.DataFrame(
                [[i + 524], [total_tokens], [sentiment_score], [not_covered_tokens], [pos_tokens],
                 [positive_rating], [neg_tokens], [negative_rating], [sentiment_distance], [str(summary)],
                 [pos_tokens_dict], [neg_tokens_dict]]).transpose()])
        return summary_sentiment

    df = pd.read_csv(self.source, index_col='text_id')
    print("Source loaded ..")
    nlp = spacy.load('de_core_news_md')
    print("Spacy model loaded ..")
    print("Starting preprocessing ..")
    df['processed_summary'] = df['discussion_summary'].apply(lambda row: _preprocess_text_with_spacy(row, nlp))
    print("Summaries preprocessed ..")
    summaries = df[~df['discussion_summary'].isna()]['processed_summary'].values
    sentiws = spaCySentiWS(sentiws_path=self.sentiws_path)
    nlp.add_pipe(sentiws)
    summary_sentiment = _sentiment_analysis(summaries, nlp)
    print("Sentiment analysis completed ..")
    summary_sentiment = summary_sentiment.rename(
        columns={0: "text_id", 1: "total_tokens", 2: "sentiment_score", 3: "tokens_without_sentiment",
                 4: "positive_tokens", 5: "positive_sentiment", 6: "negative_tokens", 7: "negative_sentiment",
                 8: "sentiment_distance", 9: "element_text", 10: "pos_tokens",
                 11: "neg_tokens"}).reset_index().drop(columns='index').set_index('text_id').copy()
    summary_sentiment.to_csv(self.output_path)
    print("DataFrame saved ..")
def detailed_summary(self):
    """performs sentiment analysis on summary details"""

    def _preprocess_text_with_spacy(summary, model):
        """returns text with lemmatized tokens"""
        one_summary = []
        doc = model(summary)
        for word in doc:
            if not word.is_stop and word.is_alpha:
                if str(word) in ["Winkels", "Spinnen"]:
                    word = str(word)
                else:
                    word = word.lemma_
                one_summary.append(word)
        return ' '.join(one_summary)

    def _detailed_sentiment_analysis(df2, model):
        """performs detailed sentiment analysis with sentiWS, returns new DataFrame"""
        detailed_scores = pd.DataFrame()
        for author_id in range(524, (len(df2) + 524)):
            print(author_id)
            critics = critics_all[
                critics_all.year == int(df2.iloc[author_id - 524].date_participation[:4])].full_name.values
            critics_last_names = [critic.split()[-1] for critic in critics]
            if author_id >= 717 and author_id <= 827:
                split_summary = df2.iloc[author_id - 524]['discussion_summary'].split("\n")
            else:
                split_summary = df2.iloc[author_id - 524]['discussion_summary'].split("\n\n")
            cleaned_split_summary = []
            for split in split_summary:
                if split == "":
                    pass
                else:
                    split = _preprocess_text_with_spacy(split, model)
                    cleaned_split_summary.append(split)
            for i, summary in enumerate(cleaned_split_summary):
                talking_ids = []
                talking = []
                doc = nlp(summary)
                sentiment_score = 0
                total_tokens = 0
                pos_tokens = 0
                neg_tokens = 0
                not_covered_tokens = 0
                positive_rating = 0
                negative_rating = 0
                pos_tokens_dict = {}
                neg_tokens_dict = {}
                for token in doc:
                    if not token.is_stop and token.is_alpha:
                        # token = token.lemma_
                        total_tokens += 1
                        if type(token._.sentiws) is float:
                            sentiment_score += token._.sentiws
                            if token._.sentiws > 0:
                                if str(token) not in pos_tokens_dict.keys():
                                    pos_tokens_dict[str(token)] = 1
                                else:
                                    pos_tokens_dict[str(token)] += 1
                                pos_tokens += 1
                                positive_rating += token._.sentiws
                            elif token._.sentiws < 0:
                                if str(token) not in neg_tokens_dict.keys():
                                    neg_tokens_dict[str(token)] = 1
                                else:
                                    neg_tokens_dict[str(token)] += 1
                                neg_tokens += 1
                                negative_rating += token._.sentiws
                        else:
                            not_covered_tokens += 1
                if pos_tokens > 0 or neg_tokens > 0:
                    sentiment_score = sentiment_score / (pos_tokens + neg_tokens)
                if pos_tokens > 0:
                    positive_rating = positive_rating / pos_tokens
                if neg_tokens > 0:
                    negative_rating = negative_rating / neg_tokens
                sentiment_distance = positive_rating - negative_rating
                for j, critic in enumerate(critics):
                    if (critic in split_summary[i]) or (critics_last_names[j] in split_summary[i]):
                        talking.append(critic)
                        talking_ids.append(str(critics_all[critics_all.full_name == critic]['critic_id'].values[0]))
                print(author_id, i, ", ".join(talking), sentiment_score)
                pos_tokens_dict = {k: v for k, v in sorted(pos_tokens_dict.items(), key=lambda item: item[1], reverse=True)}
                neg_tokens_dict = {k: v for k, v in sorted(neg_tokens_dict.items(), key=lambda item: item[1], reverse=True)}
                detailed_scores = pd.concat([detailed_scores, pd.DataFrame(
                    [[author_id], [i], ["|".join(talking_ids)], ["|".join(talking)], [total_tokens],
                     [sentiment_score], [not_covered_tokens], [pos_tokens], [positive_rating], [neg_tokens],
                     [negative_rating], [sentiment_distance], [split_summary[i]], [pos_tokens_dict],
                     [neg_tokens_dict]]).transpose()])
        return detailed_scores

    df = pd.read_csv(self.source, index_col='text_id')
    nlp = spacy.load('de_core_news_md')
    sentiws = spaCySentiWS(sentiws_path=self.sentiws_path)
    nlp.add_pipe(sentiws)
    critics_participations = pd.read_csv('../data/database_csvs/critics_participations_all.csv')
    critics = pd.read_csv('../data/database_csvs/critics_updated.csv')
    critics_all = pd.merge(critics_participations, critics, left_on='critic_id', right_on='critic_id')
    # collapse the double spaces left behind when middle_name is empty
    critics_all["full_name"] = critics_all["first_name"].str.cat(
        critics_all[["middle_name", "last_name"]].fillna("").astype(str), sep=" ").str.replace("  ", " ")
    texts = pd.read_csv('../data/database_csvs/texts_updated.csv', index_col='text_id')
    df2 = pd.merge(df, texts[['date_participation']], left_on='text_id', right_on='text_id')
    detailed_summary_sentiments = _detailed_sentiment_analysis(df2, nlp)
    detailed_summary_sentiments = detailed_summary_sentiments.rename(
        columns={0: "text_id", 1: "element_position", 2: "critic_ids_talking", 3: "critics_talking",
                 4: "total_tokens", 5: "sentiment_score", 6: "tokens_without_sentiment", 7: "positive_tokens",
                 8: "positive_sentiment", 9: "negative_tokens", 10: "negative_sentiment",
                 11: "sentiment_distance", 12: "element_text", 13: "pos_tokens_dict",
                 14: "neg_tokens_dict"}).reset_index().drop(columns='index').copy()
    detailed_summary_sentiments.to_csv('detailed_sentiment_updated.csv')
def _apply_preprocessing(
    self, dataframe: DataFrame, document_type: DocumentType, filter_type: FilterType
) -> DataFrame:
    """
    Helper function responsible for applying preprocessing steps in correct order.

    :param dataframe: data that needs to be preprocessed.
    :param document_type: Type of the document that is going to be preprocessed.
    :param filter_type: Specifies if documents with no parties or multiple parties should be removed.
    :return: Preprocessed dataframe.
    """
    self.nlp = spacy.load("de_core_news_lg", disable=["parser"])
    self.sentiws = spaCySentiWS(sentiws_path="src/data/sentiws/")
    self.nlp.add_pipe(self.sentiws)
    nltk.download("punkt")

    print("Start of preprocessing")
    start_time = time.time()

    # Copy original dataframe
    df_preprocessed = dataframe.copy()

    # Convert string date into datetime
    if "date" in df_preprocessed:
        df_preprocessed["date"] = df_preprocessed["date"].apply(lambda date: date.split("T")[0])
        df_preprocessed["date"] = df_preprocessed["date"].replace(r"^\s*$", np.nan, regex=True)
        # astype returns a new Series, so the result has to be assigned back
        df_preprocessed["date"] = df_preprocessed["date"].astype("datetime64[ns]")

    df_preprocessed["original_text"] = df_preprocessed["text"]

    # Remove rows with quotations if document is a paragraph
    if document_type.value == DocumentType.PARAGRAPH.value:
        df_preprocessed = self._remove_quotations_rows(df_preprocessed)

    # Remove special characters
    df_preprocessed["text"] = self._remove_special_characters(df_preprocessed["text"])

    # Tokenization
    df_preprocessed["text"] = self._tokenization(df_preprocessed["text"])

    # Get persons
    df_preprocessed["persons"] = self._tag_persons(df_preprocessed["text"])

    # Get organizations
    df_preprocessed["organizations"] = self._tag_organizations(df_preprocessed["text"])

    # Get parties
    df_preprocessed["parties"] = self._get_parties(df_preprocessed["organizations"])

    # Remove rows with no parties
    if filter_type.value == FilterType.PARTIES.value:
        df_preprocessed = self._remove_rows_without_parties(df_preprocessed)

    # Remove rows with no parties or more than one party
    if filter_type.value == FilterType.SINGLE_PARTY.value:
        df_preprocessed = self._keep_rows_with_one_party(df_preprocessed)

    # Sentiment polarity sentiws
    df_preprocessed["polarity"] = self._determine_polarity_sentiws(df_preprocessed["text"])

    # Sentiment polarity TextBlob
    df_preprocessed["polarity_textblob"] = self._determine_polarity_textblob(df_preprocessed["original_text"])

    # POS tagging
    df_preprocessed["pos_tags"] = self._pos_tagging(df_preprocessed["text"])

    # Get nouns
    df_preprocessed["nouns"] = self._get_nouns(df_preprocessed["text"])

    # Lemmatization
    df_preprocessed["text"] = self._lemmatizing(df_preprocessed["text"])

    # Negation handling
    df_preprocessed = self._negation_handling(df_preprocessed)

    end_time = time.time()
    print("End of preprocessing after {} seconds".format(end_time - start_time))

    if filter_type.value != FilterType.NONE.value:
        print("Number of documents after filtering: {}".format(len(df_preprocessed)))

    return df_preprocessed
def setUpClass(self):
    self.nlp = spacy.load('de')
    sentiws = spaCySentiWS(sentiws_path='data/sentiws/')
    self.nlp.add_pipe(sentiws)