def prelayer_processor(news_df):
    # Build per-word features (distance, VADER sentiment, relative position,
    # word id, symbol) for the filtered words of each news summary, then
    # normalise them and attach cluster assignments.
    sia = SentimentIntensityAnalyzer()
    conca_, filtred_news = [], []
    news_df = news_df[['summary', 'time', 'symbol']]
    news_dic = {}
    for i in news_df.index:
        tmp_iloc = news_df.iloc[i]
        conca_ += tmp_iloc['summary']
        news_dic[i] = [tmp_iloc['time'], tmp_iloc['summary']]
    word_enumerate(conca_)
    news_idx = news_storing(news_dic)

    # Keep only the frequency-filtered words of each article.
    words_idx = []
    for article in range(len(news_idx)):
        tmp_ = freq_filter(news_idx[article][1])
        words_idx += tmp_
        tmp_word = []
        for i in tmp_:
            tmp_word.append(
                news_idx[article][0][news_idx[article][1].index(i)])
        news_idx[article][1] = tmp_word
        filtred_news.append(tmp_word)

    conca_sep = []
    for i in news_idx:
        conca_sep.append(list(i[1]))

    # Score every remaining word with VADER; fall back to an approximate
    # lookup when the word is absent from the lexicon (compound == 0).
    vader_sentiment_, distance_ = [], []
    conca_ = sum(conca_sep, [])
    words_ref = list(sia.make_lex_dict().keys())
    for i in conca_:
        distance_.append(absolute_distance(i))
        sent_ = sia.polarity_scores(i)['compound']
        if not sent_:
            sent_ = approximate_sentiment(i, words_ref, sia)
        vader_sentiment_.append(sent_)

    # Relative position of each word inside its article, plus the article's
    # symbol repeated eight times to line up with the retained words.
    words_position = np.asarray(sum(
        [[round((j + 1) / len(i), 4) for j in range(len(i))]
         for i in conca_sep], []), dtype='float64')
    symbols = sum([[i] * 8 for i in list(news_df['symbol'])], [])

    df = pd.DataFrame(
        np.array([distance_, vader_sentiment_, words_position,
                  words_idx, symbols]).transpose(),
        index=[i for i in conca_],
        columns=['distance', 'sentiment', 'position', 'word_id', 'symbol'])
    df['distance'] = pd.to_numeric(df['distance'], errors='coerce')
    df['sentiment'] = pd.to_numeric(df['sentiment'], errors='coerce')
    df['position'] = pd.to_numeric(df['position'], errors='coerce')
    df['word_id'] = pd.to_numeric(df['word_id'], errors='coerce')

    # Z-score normalisation of every column except 'symbol'.
    for col in list(df.columns)[:-1]:
        c_ = df[col]
        df[col] = (c_ - c_.mean()) / c_.std(ddof=0)
    df = df.dropna(axis='rows')

    # Attach the pre-computed cluster assignments.
    with open("assets/englishClusters.log", "r") as f:
        clusters = json.loads(f.read())
    df = assignment(df, clusters)[[
        'distance', 'sentiment', 'position', 'kmean', 'cluster_distance',
        'cluster_sentiment', 'word_id', 'symbol'
    ]].dropna(axis='rows')
    return df, filtred_news
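# Hedged illustration of the fallback used above: polarity_scores() gives a
# compound score of 0.0 for a single word missing from the VADER lexicon, so a
# nearby lexicon entry can be scored instead. The helper name and the
# string-similarity choice are assumptions for illustration, not the project's
# own approximate_sentiment().
import difflib

def approximate_sentiment_demo(word, words_ref, sia):
    # Pick the closest lexicon key by string similarity and score that instead.
    match = difflib.get_close_matches(word, words_ref, n=1)
    return sia.polarity_scores(match[0])['compound'] if match else 0.0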
def analyzeSentiment(sent_text: str):
    """
    Takes in a body of text and returns its compound sentiment score
    :param sent_text: Body of text to analyze the sentiment of
    :returns: The compound sentiment score of the text, ranging from -1 to 1
    """
    # Get all non-stopwords as a dict of form {index: word}
    nonStopWords = tokenizeNonStopWords(sent_text)
    # All words tokenized
    tokens = word_tokenize(sent_text)

    # Determine whether each word is in the VADER lexicon:
    # if it is, leave it alone; if not, try lemmatizing it so that it might be.
    sia = SentimentIntensityAnalyzer()
    lex_dict = sia.make_lex_dict()
    lemmatizer = WordNetLemmatizer()
    for key, val in nonStopWords.items():
        # Check if the lexicon has this word
        if lex_dict.get(val) is None:
            # Word not in lex_dict: lemmatize assuming adjective and replace it
            tokens[key] = lemmatizer.lemmatize(val, 'a')

    # Detokenize to get the modified text
    detokenizer = MosesDetokenizer()
    mod_text = detokenizer.detokenize(tokens, return_str=True)

    # Get sentiment scores; only the compound score is of interest
    sentiment = sia.polarity_scores(mod_text)
    return sentiment['compound']
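# A minimal usage sketch, assuming the required NLTK data (vader_lexicon, punkt,
# wordnet) has already been downloaded and that tokenizeNonStopWords() is
# defined in this module; the sample sentence is illustrative only.
if __name__ == '__main__':
    sample = "The plot was surprisingly engaging, though the pacing dragged."
    print(analyzeSentiment(sample))  # a compound score between -1 and 1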
plt.title('Sentiment Analysis of Romney Corpus: HW04 vs HW06')
plt.legend()
plt.tight_layout()
# Save plot to folder
plt.savefig(os.path.join(path, 'ts_R1R2.svg'))

########################################################################
######## P.0 ###########################################################
# Create set of excludable characters for word cleaning: keep '!' and add
# typographic quotes and dashes in its place
exclude = set(punctuation.replace('!', '“”’—'))

# Create list of special (non-alphanumeric) entries from the VADER lexicon
analyzer = SentimentIntensityAnalyzer()
lex = analyzer.make_lex_dict()
special = [l for l in lex if not l.isalnum()]

# Open twitter dataset
infile = gzip.open('HW04_twitterData.json.txt.gz', 'rt', encoding='utf-8')
all_tweets = reader(infile)
infile.close()

######## P.1 ###########################################################
sentence1 = "Alicia and Ben still have the best , most romantic story arc."
sentence2 = "Reed is a great scientist but a terrible husband (and father)!"

analyzer = SentimentIntensityAnalyzer()
result1 = analyzer.polarity_scores(sentence1)
result2 = analyzer.polarity_scores(sentence2)
LEN_LEXICON = len(analyzer.make_lex_dict())
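# Illustrative inspection of the two results above (an added sketch, not part of
# the original script). polarity_scores() returns a dict with 'neg', 'neu',
# 'pos' and 'compound' keys.
for label, result in (('sentence1', result1), ('sentence2', result2)):
    print(label, result)
print('VADER lexicon size:', LEN_LEXICON)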
vocab = model.vocab.keys()
print(len(vocab), "is the size of the vocabulary")
print("embedding is done in %0.3fs." % (time() - t0))

t0 = time()
model.init_sims()
used_emojis = vocab & emoji_map_back.keys()
# print(used_emojis)
vader_lex = sid.make_lex_dict()
# print(type(vader_lex))

top10_list_word = {}  # top 20 meaningful words per emoji
for emoji in used_emojis:
    sims = model.most_similar(emoji, topn=1000)
    temp = []
    limit = 20
    for sim in sims:
        if limit < 1:
            break
        # Keep neighbours that are ordinary words present in the VADER lexicon
        if sim[0] not in used_emojis and sim[0] in vader_lex:
            temp.append(sim)
            limit -= 1
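    # A plausible continuation (an assumption, not part of the original
    # fragment): keep each emoji's filtered neighbours keyed by emoji, and
    # average their VADER valences via the word -> valence mapping that
    # make_lex_dict() returns.
    top10_list_word[emoji] = temp
    avg_valence = sum(vader_lex[word] for word, _ in temp) / max(len(temp), 1)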