# Assumed imports for this snippet; word_enumerate, news_storing, freq_filter,
# absolute_distance, approximate_sentiment and assignment are project helpers
# defined elsewhere.
import json

import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def prelayer_processor(news_df):
    sia = SentimentIntensityAnalyzer()
    conca_, filtered_news = [], []
    # Keep only the columns that matter and make the index positional so that
    # .iloc below lines up with the loop variable.
    news_df = news_df[['summary', 'time', 'symbol']].reset_index(drop=True)
    news_dic = {}
    for i in news_df.index:
        row = news_df.iloc[i]
        conca_ += row['summary']
        news_dic[i] = [row['time'], row['summary']]
    # Index every word, then keep only the words that pass the frequency
    # filter for each article.
    word_enumerate(conca_)
    news_idx = news_storing(news_dic)
    words_idx = []
    for article in range(len(news_idx)):
        tmp_ = freq_filter(news_idx[article][1])
        words_idx += tmp_
        tmp_word = [news_idx[article][0][news_idx[article][1].index(i)]
                    for i in tmp_]
        news_idx[article][1] = tmp_word
        filtered_news.append(tmp_word)
    conca_sep = [list(i[1]) for i in news_idx]
    # Score every retained word with VADER; a compound score of 0.0 means the
    # word is not in the lexicon, so fall back to approximate_sentiment().
    vader_sentiment_, distance_ = [], []
    conca_ = sum(conca_sep, [])
    words_ref = list(sia.make_lex_dict().keys())
    for i in conca_:
        distance_.append(absolute_distance(i))
        sent_ = sia.polarity_scores(i)['compound']
        if not sent_:
            sent_ = approximate_sentiment(i, words_ref, sia)
        vader_sentiment_.append(sent_)
    # Relative position of each word inside its article, in (0, 1].
    words_position = np.asarray(
        sum([[round((j + 1) / len(i), 4) for j in range(len(i))]
             for i in conca_sep], []),
        dtype='float64')
    # One symbol per retained word (assumes freq_filter keeps 8 words per article).
    symbols = sum([[i] * 8 for i in list(news_df['symbol'])], [])
    # One row per word, indexed by the word itself.
    df = pd.DataFrame(
        np.array([distance_, vader_sentiment_, words_position, words_idx,
                  symbols]).transpose(),
        index=list(conca_),
        columns=['distance', 'sentiment', 'position', 'word_id', 'symbol'])
    # The stacked array is of object dtype, so cast the numeric columns back.
    for col in ['distance', 'sentiment', 'position', 'word_id']:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    # Z-score normalise every column except 'symbol'.
    for col in list(df.columns)[:-1]:
        c_ = df[col]
        df[col] = (c_ - c_.mean()) / c_.std(ddof=0)
    df = df.dropna(axis='rows')
    # Attach the pre-computed cluster features and drop incomplete rows.
    with open("assets/englishClusters.log", "r") as f:
        clusters = json.load(f)
    df = assignment(df, clusters)[[
        'distance', 'sentiment', 'position', 'kmean', 'cluster_distance',
        'cluster_sentiment', 'word_id', 'symbol'
    ]].dropna(axis='rows')
    return df, filtered_news
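
# Why the fallback above triggers on a compound score of 0.0: single words that
# are not in the VADER lexicon always score exactly 0.0. A minimal sketch,
# assuming the standalone vaderSentiment package (the NLTK version behaves the
# same); the example words are illustrative only.
demo_sia = SentimentIntensityAnalyzer()
print(demo_sia.polarity_scores('excellent')['compound'])   # positive, non-zero: in the lexicon
print(demo_sia.polarity_scores('blockchain')['compound'])  # 0.0: not in the lexicon
print('excellent' in demo_sia.make_lex_dict())             # True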


# Assumed imports for this snippet; tokenizeNonStopWords() is a project helper
# defined elsewhere, and MosesDetokenizer is taken from sacremoses here (older
# NLTK releases shipped an equivalent nltk.tokenize.moses version).
from nltk import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sacremoses import MosesDetokenizer


def analyzeSentiment(sent_text: str):
    """
    Take a body of text and return its VADER compound sentiment score.

    :param sent_text: Body of text to analyze the sentiment of
    :returns: The compound sentiment score of the text, ranging from -1 to 1
    """

    # Get all non-stopwords in a dict of form {index: word}
    nonStopWords = tokenizeNonStopWords(sent_text)

    # All words tokenized
    tokens = word_tokenize(sent_text)

    # Determine if each word is in VADER corpus
    # if in corpus, leave it alone
    # if not in corpus, attempt lemmatizing to see if in corpus
    # Dictionary of all VADER words
    sia = SentimentIntensityAnalyzer()
    lex_dict = sia.make_lex_dict()
    lemmatizer = WordNetLemmatizer()
    for key, val in nonStopWords.items():
        # Check if lexicon has this word
        hasWord = lex_dict.get(val, None)
        if hasWord is None:
            # Word not in lex_dict
            # Lemmatize assuming adjective and replace word
            tokens[key] = lemmatizer.lemmatize(val, 'a')

    # Detokenize to modified sentiment text
    detokenizer = MosesDetokenizer()
    mod_text = detokenizer.detokenize(tokens, return_str=True)

    # Get sentiment scores
    sentiment = sia.polarity_scores(mod_text)

    # Only interested in the compound score
    return sentiment['compound']
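
# Hypothetical usage of analyzeSentiment(); the sentence is illustrative only
# and assumes the project helper tokenizeNonStopWords() is importable.
print(analyzeSentiment("The merger announcement was a great surprise for investors!"))
# Prints a compound score in [-1, 1]; the positive wording pushes it above 0.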
Example #3
    plt.title('Sentiment Analysis of Romney Corpus: HW04 vs HW06')
    plt.legend()
    plt.tight_layout()
    # Save plot to folder
    plt.savefig(os.path.join(path, 'ts_R1R2.svg'))


########################################################################

######## P.0 ###########################################################
# Imports assumed by this part of the script (its top is not shown here);
# reader() further below is a project helper that loads the tweet file.
import gzip
from string import punctuation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Characters to strip when cleaning words: drop '!' from the exclusion set
# (VADER scores it) and add curly quotes plus the em dash.
exclude = set(punctuation.replace('!', '') + '“”’—')

# Collect the VADER lexicon entries that are not purely alphanumeric
# (emoticons such as ':)' and other special tokens)
analyzer = SentimentIntensityAnalyzer()
lex = analyzer.make_lex_dict()
special = [token for token in lex if not token.isalnum()]

# Open the gzipped Twitter dataset; reader() parses it into a list of tweets.
with gzip.open('HW04_twitterData.json.txt.gz', 'rt', encoding='utf-8') as infile:
    all_tweets = reader(infile)
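
# A minimal sketch of how the exclude/special sets above might be applied when
# cleaning tweet tokens; clean_word() is a hypothetical helper, not part of the
# original script.
def clean_word(word):
    # VADER's own non-alphanumeric entries (emoticons such as ':)') are kept whole.
    if word in special:
        return word
    # Otherwise strip excluded punctuation; '!' survives by construction.
    return ''.join(ch for ch in word if ch not in exclude)

# clean_word(':)')       -> ':)'
# clean_word('“great!”') -> 'great!'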

######## P.1 ###########################################################
sentence1 = "Alicia and Ben still have the best , most romantic story arc."
sentence2 = "Reed is a great scientist but a terrible husband (and father)!"

analyzer = SentimentIntensityAnalyzer()
result1 = analyzer.polarity_scores(sentence1)
result2 = analyzer.polarity_scores(sentence2)
LEN_LEXICON = len(analyzer.make_lex_dict())
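
# Why P.0 keeps '!': VADER boosts intensity for exclamation marks, so stripping
# them would lower the magnitude of the compound score. A small illustration
# (the exact numbers depend on the VADER version):
print(analyzer.polarity_scores("Reed is a great scientist")['compound'])
print(analyzer.polarity_scores("Reed is a great scientist!")['compound'])
# The second compound score is slightly larger because of the '!' boost.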
Example #4
# Vocabulary of the trained embedding model (pre-4.0 gensim API).
vocab = model.vocab.keys()

print(len(vocab), "words in the vocabulary")
print("embedding is done in %0.3fs." % (time() - t0))


t0 = time()

model.init_sims()

# Emojis that occur both in the embedding vocabulary and in the emoji map.
used_emojis = vocab & emoji_map_back.keys()
# print(used_emojis)

# VADER lexicon; sid is a SentimentIntensityAnalyzer instance created earlier.
vader_lex = sid.make_lex_dict()
# print(type(vader_lex))

# For each emoji, collect its 20 most similar words that are in the VADER lexicon.
top10_list_word = {}

for emoji in used_emojis:
    sims = model.most_similar(emoji, topn=1000)
    temp = []
    limit = 20
    for sim in sims:
        if limit < 1:
            break
        # Keep only neighbours that are real words present in the VADER lexicon.
        if sim[0] not in used_emojis and sim[0] in vader_lex:
            temp.append(sim)
            limit -= 1
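
# A self-contained sketch of the same pattern, written against the gensim 4 API
# (the snippet above uses the older model.most_similar form). The toy corpus,
# the emoji tokens and all names here are illustrative assumptions only.
from gensim.models import Word2Vec
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

toy_sentences = [["😀", "great", "happy", "love"],
                 ["😀", "good", "wonderful", "nice"],
                 ["😡", "bad", "terrible", "awful"]] * 50
toy_model = Word2Vec(toy_sentences, vector_size=16, window=3, min_count=1, seed=1)
toy_lex = SentimentIntensityAnalyzer().make_lex_dict()

# Keep the neighbours of the emoji that are themselves VADER lexicon words.
neighbours = [(word, score) for word, score in toy_model.wv.most_similar("😀", topn=10)
              if word in toy_lex][:5]
print(neighbours)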