def progress(self, chunk):
    """Display a transcribed chunk together with its sentiment label.

    The language selected in ``self.combo`` picks the analyser: ``'German'``
    uses ``textblob_de.TextBlobDE``, every other value uses
    ``textblob.TextBlob``. The first element of ``blob.sentiment`` is bucketed
    at -0.20 / +0.20 into a negative, neutral or positive label. The result is
    printed and appended to ``self.textboxTranscript``, then ``self.show()``
    is called.

    Args:
        chunk: Transcribed text to analyse and display.
    """
    language = str(self.combo.currentText())
    if language == 'German':
        from textblob_de import TextBlobDE as TextBlob
        logging.debug("Progress: %s", language)
        logging.debug("Progress: %s", chunk)
        negative_label, positive_label = "Nicht Glücklich", "Glücklich"
        # The German console line carries an extra ',' field before the score.
        extra_fields = (',',)
    else:
        from textblob import TextBlob
        negative_label, positive_label = "Not happy", "Happy"
        extra_fields = ()

    # Only the sentiment is needed here; sentences, tokens, tags and noun
    # phrases were previously evaluated and thrown away.
    score = TextBlob(chunk).sentiment[0]
    if score < -0.20:
        sentiment = negative_label
    elif score > 0.20:
        sentiment = positive_label
    else:
        sentiment = "Neutral"

    print(chunk, ' , ', sentiment, *extra_fields, score)
    self.textboxTranscript.insertPlainText("Transcription === " + chunk)
    self.textboxTranscript.insertPlainText("\n--------------------")
    self.textboxTranscript.insertPlainText("\nSentiment === " + sentiment)
    self.show()
def progress(self, chunk):
    """Display a transcribed chunk together with its sentiment label.

    The language selected in ``self.combo`` picks the analyser: ``'German'``
    uses ``textblob_de.TextBlobDE``, every other value uses
    ``textblob.TextBlob``. The first element of ``blob.sentiment`` is mapped
    to a label, printed, and appended to ``self.textboxTranscript``; then
    ``self.show()`` is called.

    Scores strictly between -0.80 and 0.20 are "Neutral", scores at or below
    -0.80 get the negative label and everything else the positive label.
    The neutral band is tested first: previously a broader ``< 1`` /
    ``< .60`` test ran before it, which made the neutral branch unreachable.

    Args:
        chunk: Transcribed text to analyse and display.
    """
    language = str(self.combo.currentText())
    if language == 'German':
        from textblob_de import TextBlobDE as TextBlob
        logging.debug("Progress: %s", language)
        logging.debug("Progress: %s", chunk)
        negative_label, positive_label = "Nicht Gut", "Gut"
        summary_prefix = "\nUnd die sentiment auf das satze ist ===== "
    else:
        from textblob import TextBlob
        negative_label, positive_label = "Not Happy", "Happy"
        summary_prefix = "\nAnd the sentiment of that sentence is ===== "

    # Only the sentiment is needed here; sentences, tokens, tags and noun
    # phrases were previously evaluated and thrown away.
    score = TextBlob(chunk).sentiment[0]
    if -0.80 < score < 0.20:
        sentiment = "Neutral"
    elif score <= -0.80:
        sentiment = negative_label
    else:
        sentiment = positive_label

    print(chunk, ' , ', sentiment)
    self.textboxTranscript.insertPlainText(chunk)
    self.textboxTranscript.insertPlainText("\n*********************")
    self.textboxTranscript.insertPlainText(summary_prefix + sentiment)
    self.show()
def scrape(self, response):
    """Fill the article item carried in ``response.meta`` and yield it.

    Extracts headline, publish date, body text and author via CSS selectors,
    then derives sentiment (TextBlob), the seven most common hot words and a
    summary from the joined body text.

    Args:
        response: Response object exposing ``meta``, ``css`` and ``url``.

    Yields:
        The populated article item.
    """
    articleItem = response.meta['item']
    articleItem['headline'] = response.css('h3::text').extract()
    articleItem['date_publish'] = response.css(
        "time::attr('title')").extract()
    articleItem['article_text'] = response.css(
        '.css-1jftgse p::text').extract()
    article_text = ''.join(articleItem['article_text'])
    articleItem['author'] = ','.join(response.css(
        ".css-134vnn1 section:nth-child(3) li a span::text").extract())

    articleItem['sentiment'] = TextBlob(article_text).sentiment

    key_words = self.get_hotwords(article_text)
    # Each keyword keeps its trailing ', ' so the stored string format is
    # unchanged (including the separator after the last keyword).
    articleItem['keywords'] = ''.join(
        kw[0] + ', ' for kw in Counter(key_words).most_common(7))
    articleItem['summary'] = summarize(article_text, ratio=0.2)
    articleItem['link'] = response.url

    if 'Sekundäre Navigation' in articleItem['headline']:
        articleItem['headline'].remove('Sekundäre Navigation')

    # NOTE(review): the dots are unescaped, so any separator character
    # matches - confirm whether only dd.mm.yyyy should be accepted.
    date_pattern = re.compile(r'\d{2}.\d{2}.\d{4}')
    raw_dates = articleItem['date_publish']
    for raw_date in raw_dates:
        match = date_pattern.search(raw_date)
        # Skip strings without a date instead of failing on ``None.group()``.
        if match is not None:
            articleItem['date_publish'] = match.group()
    yield articleItem
def process_tweet(self, tweet):
    """Clean one tweet, add it to the rolling buffer and print statistics.

    The tweet text is stripped with ``self.cleanup_regex`` and printed. The
    polarity of the buffer as it stood before this tweet is printed, then the
    tweet is appended and the buffer is trimmed to the last
    ``self.buffer_len`` entries. Every 100th tweet a status report (count,
    rate, polarity of the buffer) is printed.

    Args:
        tweet: Mapping with a ``"text"`` entry.
    """
    cleaned = re.sub(self.cleanup_regex, "", tweet["text"], 0)
    print("[Tweet] %s" % cleaned)

    # Polarity of the buffer before the current tweet is added.
    previous_window = TextBlob(" ".join(self.tweet_buffer))
    print(previous_window.sentiment.polarity)

    self.tweet_buffer.append(cleaned)
    self.tweet_buffer = self.tweet_buffer[-1 * self.buffer_len:]
    self.tweet_count += 1

    if self.tweet_count % 100 != 0:
        return

    print("[Status] Tweet count: {}".format(self.tweet_count))
    elapsed = time.time() - self.time_start
    print("[Status] Tweets per second: {:.2}".format(
        self.tweet_count / elapsed))
    current_window = TextBlob(" ".join(self.tweet_buffer))
    print("Sentiment last {} tweets: {:.5}".format(
        self.buffer_len, current_window.sentiment.polarity))
def lemmatize(text):
    """Return the lemmatized words of *text* after cleaning.

    The text is passed through ``cleantext``, converted to ``str`` and wrapped
    in a ``TextBlob``; the result of ``words.lemmatize()`` is returned.
    """
    cleaned = str(cleantext(text))
    return TextBlob(cleaned).words.lemmatize()
def get_most_significant_words(corpus):
    """Count word occurrences across every text in *corpus*.

    Args:
        corpus: Iterable of texts; each is tokenized via ``TextBlob(text).words``.

    Returns:
        A ``Counter`` mapping each word to its total number of occurrences.
    """
    tally = Counter()
    for document in corpus:
        tally.update(TextBlob(document).words._collection)
    return tally
def test_tag_blob_pattern_tok_include_punc(self):
    """Tagged words match the expected tokens when punctuation is included.

    Uses ``PatternTokenizer`` with a ``PatternTagger(include_punc=True)``.
    """
    expected_words = ["Das", "ist", "ein", "schönes", "Auto", "."]
    tokenizer = PatternTokenizer()
    tagger = PatternTagger(include_punc=True)
    tags = TextBlob(self.text, tokenizer=tokenizer, pos_tagger=tagger).tags
    logging.debug("tags: {0}".format(tags))
    for position, pair in enumerate(tags):
        assert_equal(pair[0], expected_words[position])
def test_tag_blob_defaults(self):
    """Tagged words match the expected tokens with the default settings.

    Also checks that the last tagged word is "Auto".
    """
    expected_words = ["Das", "ist", "ein", "schönes", "Auto"]
    tags = TextBlob(self.text).tags
    logging.debug("tags: {0}".format(tags))
    for position, pair in enumerate(tags):
        assert_equal(pair[0], expected_words[position])
    last_word = tags[-1][0]
    assert_equal(last_word, "Auto")
def analize_sentiment(tweet):
    """Return the sign of the tweet's TextBlob polarity.

    The tweet is cleaned with ``clean_tweet`` before analysis.

    Returns:
        1 for positive polarity, 0 for exactly zero, -1 otherwise.
    """
    polarity = TextBlob(clean_tweet(tweet)).sentiment.polarity
    if polarity > 0:
        return 1
    if polarity == 0:
        return 0
    return -1
def pipe_features_before_preprocessing(corpus):
    """Extract the raw-text features for every text in *corpus*.

    Args:
        corpus: Iterable of texts.

    Returns:
        A tuple ``(punctuation, caps_count, last_char)`` of three lists, each
        holding one entry per text, produced by the matching
        ``feature_extraction`` helper.
    """
    punctuation, caps_count, last_char = [], [], []
    for text in corpus:
        # All three features work on the raw string, so the TextBlob that
        # used to be built here (and never used) is no longer created.
        punctuation.append(feature_extraction.get_punctuation_vector(text))
        caps_count.append(feature_extraction.get_caps_words_count(text))
        last_char.append(feature_extraction.get_last_char_vector(text))
    return punctuation, caps_count, last_char
def get_wv_vec_sequence(texts):
    """Build one word-vector array with a leading axis of 1 per text.

    Each text is run through ``pipe_preprocessing``, tokenized with TextBlob
    and converted by ``feature_extraction.get_word_vec_repr``. The result is
    reshaped to ``(1, shape[0], shape[1])``.

    Returns:
        List of the reshaped arrays, in input order.
    """
    sequences = []
    for raw_text in texts:
        words = TextBlob(pipe_preprocessing(raw_text)).words
        vectors = feature_extraction.get_word_vec_repr(words)
        first_dim, second_dim = vectors.shape[0], vectors.shape[1]
        sequences.append(vectors.reshape((1, first_dim, second_dim)))
    return sequences
def trending(channel_name):
    """Return the word counts of a channel's messages, most frequent first.

    All ``"msg"`` fields returned by ``channel_messages`` for the channel are
    joined with spaces and counted via ``TextBlob.word_counts``.

    Returns:
        Dict with the single key ``"trending topics"`` holding a list of
        ``(word, count)`` pairs sorted by descending count.
    """
    channel_data = channel_messages(rocket, channel_name)['messages']
    combined_text = " ".join(entry["msg"] for entry in channel_data)
    counts = TextBlob(combined_text).word_counts
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return {"trending topics": ranked}
def sentiment_analysis():
    """Analyse the sentence posted in the request body.

    Reads the JSON payload (parsing is forced), prints the payload and its
    ``"sentence"`` entry, and responds with the TextBlob sentiment of that
    sentence.

    :return: ``jsonify`` response built from ``TextBlob(sentence).sentiment``.
    """
    payload = request.get_json(force=True)
    print(payload)
    sentence = payload["sentence"]
    print(sentence)
    return jsonify(TextBlob(sentence).sentiment)
def get_sentiment(self, tweet):
    """Return the sentiment polarity of a single tweet.

    Args:
        tweet (string): Text of the tweet.

    Returns:
        The ``sentiment.polarity`` value TextBlob computes for the text.
    """
    return TextBlob(tweet).sentiment.polarity
def lemmatize_words(self, text):
    """Lemmatize the words of a string.

    Args:
        text (string): Text which should be lemmatized.

    Returns:
        The result of ``TextBlob(text).words.lemmatize()``.
    """
    return TextBlob(text).words.lemmatize()
def predict(self, textarray):
    """Label every text as "neutral", "positive" or "negative".

    The label follows the sign of the TextBlob polarity. Iteration is wrapped
    in ``tqdm``.

    Args:
        textarray: Iterable of texts.

    Returns:
        List of labels in input order.
    """
    labels = []
    for entry in tqdm(textarray):
        polarity = TextBlob(entry).sentiment.polarity
        if polarity == 0:
            label = "neutral"
        elif polarity > 0:
            label = "positive"
        elif polarity < 0:
            label = "negative"
        else:
            # No comparison matched; nothing is appended, as before.
            continue
        labels.append(label)
    return labels
def sentiment_analyse(hits):
    """Average the SentiWS sentiment around each word of interest.

    For every word in ``gleichhäufige_worte`` each occurrence in the lemma
    list of *hits* is located. A window of up to five lemmas on either side is
    joined, lower-cased and run through ``nlp``. The window score is the mean
    of the tokens' ``sentiws`` values, or 0 when no token carries one. The
    per-word result is the mean over all of its windows; it is printed and
    stored.

    Words that never occur in the lemma list are skipped (no print, no dict
    entry) instead of raising ``ZeroDivisionError``.

    Args:
        hits: Input passed to ``prep.get_words``.

    Returns:
        Dict mapping each word that occurred to its average sentiment.
    """
    words = prep.get_words(hits)
    lemmalist = prep.get_lemmalist(words, "all")
    senti_dict = {}
    for wort in gleichhäufige_worte:
        senti_list = []
        for i, lemma in enumerate(lemmalist):
            if lemma != wort:
                continue
            # Context window: five lemmas before and after, clamped at the
            # start of the list.
            aspect = " ".join(lemmalist[max(i - 5, 0):i + 6]).lower()
            scores = [
                token._.sentiws
                for token in nlp(aspect)
                if token._.sentiws is not None
            ]
            senti_list.append(sum(scores) / len(scores) if scores else 0)

        if not senti_list:
            # The word does not occur at all, so there is nothing to average.
            continue

        average = sum(senti_list) / len(senti_list)
        print(f"Durchschnittlicher Sentimentwert von {wort} ist: {average}")
        senti_dict[wort] = average
    return senti_dict
def get_embedding_indices(texts, maxlen=None):
    """Convert texts to embedding-index sequences, optionally padded.

    Each text is run through ``pipe_preprocessing``, tokenized with TextBlob
    and mapped by ``feature_extraction.get_embedding_indices``.

    Args:
        texts: Iterable of texts.
        maxlen: When truthy, the sequences are passed to ``pad_sequences``
            with post-padding and post-truncation at this length.

    Returns:
        The list of index sequences, or the ``pad_sequences`` result when
        *maxlen* is given.
    """
    sequences = [
        feature_extraction.get_embedding_indices(
            TextBlob(pipe_preprocessing(raw_text)).words)
        for raw_text in texts
    ]
    if not maxlen:
        return sequences
    return pad_sequences(sequences,
                         maxlen=maxlen,
                         padding='post',
                         truncating='post')
def on_data(self, data):
    """Index the sentiment of one German tweet in Elasticsearch.

    The raw JSON payload is decoded; tweets whose ``"lang"`` is not ``"de"``
    are ignored. For German tweets the TextBlob polarity is mapped to
    "negative" / "neutral" / "positive" and a document is written to the
    ``tweets`` index. Every 100th indexed tweet a progress line is printed.

    Any ``Exception`` raised while handling the payload is reported on stdout
    and swallowed. ``KeyboardInterrupt`` and ``SystemExit`` are no longer
    caught.

    Args:
        data: Raw JSON string of one stream message.

    Returns:
        Always ``True``.
    """
    try:
        dict_data = json.loads(data)

        # Filter on language first so no TextBlob is built for tweets that
        # are dropped anyway.
        if dict_data["lang"] != "de":
            return True

        tweet = TextBlob(dict_data["text"])
        print(tweet)

        polarity = tweet.sentiment.polarity
        if polarity < 0:
            sentiment = "negative"
        elif polarity == 0:
            sentiment = "neutral"
        else:
            sentiment = "positive"

        created_at = time.strptime(dict_data['created_at'],
                                   '%a %b %d %H:%M:%S +0000 %Y')
        createTimestamp = time.strftime('%Y-%m-%d %H:%M:%S', created_at)

        # Space-separated hashtag texts ("" when there are none); replaces a
        # reduce() call, which is not a builtin on Python 3.
        hashtags = " ".join(
            str(tag["text"]) for tag in dict_data["entities"]["hashtags"])

        coordinates = dict_data["coordinates"]
        location = (coordinates["coordinates"]
                    if coordinates is not None else None)

        es.index(index="tweets",
                 doc_type="tweet_sentiment",
                 body={"date": createTimestamp,
                       "user": dict_data["user"]["id_str"],
                       "message": dict_data["text"],
                       "language": dict_data["lang"],
                       "hashtags": hashtags,
                       "location": location,
                       "polarity": polarity,
                       "subjectivity": tweet.sentiment.subjectivity,
                       "sentiment": sentiment})

        self.tweet_count += 1
        if self.tweet_count % 100 == 0:
            print('Indexed {0} tweets'.format(self.tweet_count))
    except Exception:
        print("error in listener:", sys.exc_info()[0])
    return True
def test_tag_blob_nltk_tok_include_punc_stts(self):
    """Words and STTS tags match the expected values with punctuation kept.

    Uses ``NLTKPunktTokenizer`` with a
    ``PatternTagger(include_punc=True, tagset='stts')`` and also checks that
    the last tagged token is ".".
    """
    expected_words = ["Das", "ist", "ein", "schönes", "Auto", "."]
    expected_tags = ["PDS", "VVFIN", "ARTIND", "ADJA", "NN", "S"]
    tokenizer = NLTKPunktTokenizer()
    tagger = PatternTagger(include_punc=True, tagset='stts')
    tags = TextBlob(self.text, tokenizer=tokenizer, pos_tagger=tagger).tags
    logging.debug("tags: {0}".format(tags))
    for position, pair in enumerate(tags):
        assert_equal(pair[0], expected_words[position])
        assert_equal(pair[1], expected_tags[position])
    last_token = tags[-1][0]
    assert_equal(last_token, ".")
def sentiment_tweet_analysis(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute one polarity value per tweet in the 'text' column.

    :param df: pd.DataFrame, containing preprocessed tweets to analyze
    :return: pd.DataFrame with columns "polarity" and "tweet", one row per tweet
    """
    # TODO: Make language compatible sentiment analysis (Problems might arise when using TextBlobDE on 'eng' tweets)
    rows = [(TextBlob(tweet).sentiment.polarity, tweet)
            for tweet in df['text']]
    return pd.DataFrame(rows, columns=["polarity", "tweet"])
def sentiment_word_analysis(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute one polarity value per whitespace-separated word of the 'text' column.

    :param df: pd.DataFrame, containing preprocessed tweets to analyze
    :return: pd.DataFrame with columns "polarity" and "word", one row per word
    """
    # TODO: Make language compatible sentiment analysis (Problems might arise when using TextBlobDE on 'eng' tweets)
    rows = []
    for tweet in df['text']:
        for blob in [TextBlob(word) for word in tweet.split()]:
            rows.append([blob.sentiment.polarity, str(blob)])
    return pd.DataFrame(rows, columns=["polarity", "word"])
def sentiment(self, text):
    """Report the share of positive and non-positive segments in *text*.

    The text is split on '.'; each segment's TextBlob polarity is classified
    as positive (> 0) or non-positive (<= 0). Both totals count every
    segment.

    Returns:
        A string of the form
        "Positive accuracy = X% Negative accuracy = Y%".
    """
    pos_count = pos_correct = 0
    neg_count = neg_correct = 0
    for segment in text.split('.'):
        polarity = TextBlob(segment).sentiment.polarity
        pos_count += 1
        neg_count += 1
        if polarity > 0:
            pos_correct += 1
        if polarity <= 0:
            neg_correct += 1
    positive_part = "Positive accuracy = {:0.2f}%".format(
        pos_correct/pos_count * 100.0)
    negative_part = " Negative accuracy = {:0.2f}%".format(
        neg_correct/neg_count * 100.0)
    return positive_part + negative_part
def get_wv_vec(texts, max_dimension=6000):
    """Return one flattened, fixed-width word-vector row per text.

    Each text is run through ``pipe_preprocessing``, tokenized with TextBlob
    and converted by ``feature_extraction.get_word_vec_repr``. The result is
    flattened to a single row, then truncated or zero-padded on the right to
    *max_dimension* columns.

    Args:
        texts: Iterable of texts.
        max_dimension: Width of every output row.

    Returns:
        Array of shape ``(number of texts, max_dimension)``.
    """
    rows = []
    for raw_text in texts:
        words = TextBlob(pipe_preprocessing(raw_text)).words
        matrix = feature_extraction.get_word_vec_repr(words)
        flat = matrix.reshape([1, matrix.shape[1] * matrix.shape[0]])
        width = flat.shape[1]
        if width > max_dimension:
            flat = flat[:, :max_dimension]
        elif width < max_dimension:
            missing = max_dimension - width
            flat = np.pad(flat, ((0, 0), (0, missing)),
                          'constant',
                          constant_values=0)
        rows.append(flat)
    return np.reshape(np.array(rows), [len(rows), max_dimension])
def get_sentiment(number):
    """Answer with the sentiment of article *number* from the last search.

    The site stored in the session selects the site object, and the links of
    the last search are read from the session. Links that do not contain
    "http" are prefixed with the site's ``baseURL``. The article text is read,
    joined and analysed with TextBlob; a negative first sentiment element
    yields "eher negativ", anything else "positiv".

    Numbers below 1 or beyond the number of links are answered with the
    "article does not exist" prompt. Previously only the upper bound was
    checked, so 0 or negative numbers indexed from the end of the list.

    Args:
        number: 1-based article number; must be convertible with ``int``.

    Returns:
        The ``question(...)`` response to send back.
    """
    site = util.get_session_value(session.attributes, "siteName")
    if site is None:
        # Remember which intent to resume once the user names a site.
        session.attributes["lastCall"] = "senti"
        return question("Wonach wollen Sie suchen?")

    obj = get_site_obj(site)
    if obj is None:  # should never be called
        return question("Error. Wonach wollen Sie suchen?")

    links = util.get_session_value(session.attributes, "lastSearch")
    # If the site uses relative links, make absolute ones. Each link is
    # checked individually rather than counting "http" in str(links).
    links = [
        link if "http" in link else obj.baseURL + link
        for link in links
    ]

    index = int(number)
    if index < 1 or index > len(links):
        return question(
            "Dieser Artikel existiert leider nicht, versuchen Sie eine andere Nummer."
        )

    article_text = "".join(obj.read_article(links[index - 1]))
    polarity = TextBlob(article_text).sentiment[0]
    good = "eher negativ" if polarity < 0 else "positiv"
    return question("Das Sentiment ist " + good)
def metricate(tweetdf):
    """Calculate summary metrics for the tweets of one user.

    Rows are addressed by position (``iloc``), so the frame does not need a
    default 0..n-1 index.

    Args:
        tweetdf: DataFrame with the columns ``name_id``, ``t_isrt``,
            ``t_isrpl``, ``t_date``, ``t_hashtags`` and ``t_text``.

    Returns:
        ``None`` for an empty frame, otherwise a dict with the keys
        ``name_id``, ``retweet_rate``, ``replie_rate``, ``ts_perday``,
        ``hashtags`` and ``avg_sent``.
    """
    if len(tweetdf) == 0:
        return None

    user = tweetdf.name_id.iloc[0]

    # Share of retweets and replies among all tweets.
    retweet_rate = tweetdf.t_isrt.mean()
    replie_rate = tweetdf.t_isrpl.mean()

    # Tweets per day, measured from the date in the last row until now.
    # NOTE(review): assumes t_date holds naive timestamps comparable with
    # datetime.now() - confirm.
    last_tweet = tweetdf.t_date.iloc[-1].to_pydatetime()
    elapsed = datetime.datetime.now() - last_tweet
    days = elapsed.total_seconds() / (3600 * 24)
    ts_perday = len(tweetdf) / days

    # Every hashtag followed by ", " (the trailing separator is kept so the
    # output format is unchanged).
    hashtags = tweetdf.t_hashtags.explode().dropna()
    hs = "".join(tag + ", " for tag in hashtags)

    # Sentiment analysis (experimental).
    sentiments = [TextBlob(text).polarity for text in tweetdf.t_text.values]
    avg_sent = np.mean(sentiments)

    return {
        "name_id": user,
        "retweet_rate": retweet_rate,
        "replie_rate": replie_rate,
        "ts_perday": ts_perday,
        "hashtags": hs,
        "avg_sent": avg_sent
    }
def lemmatized_NLTK(self):
    """Collect (original, lemmatized) pairs for selected words and noun phrases.

    Words of ``self.blob`` tagged 'JJ' or 'VB' are appended to
    ``self._lemmatized_NLTK`` as ``(word, lemma)``, once per distinct pair.
    Every noun phrase is then appended as ``(chunk, lemmatized chunk)``.

    Returns:
        ``self._lemmatized_NLTK`` after the additions.
    """
    blob = self.blob
    already_added = []
    triples = zip(blob.words, blob.words.lemmatize(), blob.tags)
    for word, lemma, tag in triples:
        if tag[1] not in ('JJ', 'VB'):
            continue
        pair = (word, lemma)
        if pair not in already_added:
            already_added.append(pair)
            self._lemmatized_NLTK.append(pair)

    # Lemmatize each noun phrase word by word and re-join it with spaces.
    phrase_pairs = [
        (chunk, ' '.join(TextBlob(chunk).words.lemmatize()))
        for chunk in self.blob.noun_phrases
    ]
    self._lemmatized_NLTK += phrase_pairs
    return self._lemmatized_NLTK
def get_sentiment(row):
    """Score the text in the last column of *row*.

    Returns:
        ``[row, polarity, subjectivity, flesch_reading_ease]``. Polarity and
        subjectivity come from TextBlob's sentiment of the text. The
        readability value is the 'FleschReadingEase' entry of the
        'readability grades' group from ``readability.getmeasures`` with
        ``lang='de'``.
    """
    text = row[-1]  # the last column of each row is the text
    mood = TextBlob(text).sentiment
    measures = readability.getmeasures(text, lang='de')
    flesch_score = measures['readability grades']['FleschReadingEase']
    return [row, mood.polarity, mood.subjectivity, flesch_score]
def pipe_features_after_preprocessing(corpus):
    """Extract the post-preprocessing features for every non-empty text.

    Empty texts are skipped, since preprocessing can leave a text with no
    words. The most significant words are computed over the whole corpus.

    Returns:
        Tuple ``(sepl, pos_similarity, neg_similarity, polarity_score,
        word_vec, sum_vec, subjectivity_score, msw)``. The first seven are
        lists with one entry per non-empty text; ``msw`` is the result of
        ``get_most_significant_words(corpus)``.
    """
    sepl, word_vec, sum_vec = [], [], []
    pos_similarity, neg_similarity = [], []
    polarity_score, subjectivity_score = [], []

    for document in corpus:
        if len(document) == 0:
            continue
        tokens_source = TextBlob(document)
        sepl.append(feature_extraction.get_sentiment_phrase_score(document))
        word_vec.append(
            feature_extraction.get_word_vec_repr(tokens_source.words))
        sum_vec.append(feature_extraction.get_sum_vec(tokens_source.words))
        pos_similarity.append(
            feature_extraction.get_positive_word_vec_similarity(
                tokens_source.words))
        neg_similarity.append(
            feature_extraction.get_negative_word_vec_similarity(
                tokens_source.words))
        polarity_score.append(feature_extraction.get_polarity_score(document))
        subjectivity_score.append(
            feature_extraction.get_subjectivity_score(document))

    msw = get_most_significant_words(corpus)
    return (sepl, pos_similarity, neg_similarity, polarity_score, word_vec,
            sum_vec, subjectivity_score, msw)
def pipe_features_after_preprocessing(text):
    """Build the feature row for one preprocessed text.

    The sentiment phrase score, positive and negative word-vector
    similarities, polarity score and subjectivity score are concatenated
    along axis 1, in that order.

    Returns:
        The concatenated feature array, or ``np.zeros([1, 10])`` when *text*
        is empty (preprocessing can leave a text with no words).
    """
    if len(text) == 0:
        return np.zeros([1, 10])

    blob = TextBlob(text)
    parts = [
        feature_extraction.get_sentiment_phrase_score(text),
        feature_extraction.get_positive_word_vec_similarity(blob.words),
        feature_extraction.get_negative_word_vec_similarity(blob.words),
        feature_extraction.get_polarity_score(text),
        feature_extraction.get_subjectivity_score(text),
    ]
    feature_vec = parts[0]
    for part in parts[1:]:
        feature_vec = np.concatenate((feature_vec, part), axis=1)
    return feature_vec