def readFromTweet():
    dataset = pd.read_csv(r'CSVs/tweet.csv')
    reviews = np.array(dataset['text'])
    dates = np.array(dataset['created_at'])
    test_reviews = reviews
    test_dates = dates
    sample_review_ids = [430, 200, 470]

    # normalize the dataset
    norm_test_reviews = tn.normalize_corpus(test_reviews, html_stripping=False)

    # inspect SentiWordNet polarity scores for a sample adjective
    awesome = list(swn.senti_synsets('awesome', 'a'))[0]
    print('Positive Polarity Score:', awesome.pos_score())
    print('Negative Polarity Score:', awesome.neg_score())
    print('Objective Score:', awesome.obj_score())

    predicted_sentiments = [
        analyze_sentiment_sentiwordnet_lexicon(review, verbose=False)
        for review in norm_test_reviews
    ]
    for s in predicted_sentiments:
        print(s)
    generatScoreCsv(predicted_sentiments, test_dates)
def readFromCSV():
    dataset = pd.read_csv(r'CSVs/tweets_main.csv')
    reviews = np.array(dataset['review'])
    sentiments = np.array(dataset['sentiment'])

    # extract data for model evaluation
    test_reviews = reviews[:5000]
    test_sentiments = sentiments[:5000]
    # sample_review_ids = [7626, 3533, 13010]

    # normalize dataset
    norm_test_reviews = tn.normalize_corpus(test_reviews)
    predicted_sentiments = [
        analyze_sentiment_sentiwordnet_lexicon(review, verbose=False)
        for review in norm_test_reviews
    ]
    meu.display_model_performance_metrics(
        true_labels=test_sentiments,
        predicted_labels=predicted_sentiments,
        classes=['positive', 'negative'])
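Both readers call analyze_sentiment_sentiwordnet_lexicon, which is not defined in this file. A minimal sketch of how such a SentiWordNet lexicon analyzer typically works, assuming NLTK's tokenizer and POS tagger; the function body here is an illustration, not the original (which apparently also returns a score tuple, given the `[1]` indexing used later):

import nltk
from nltk.corpus import sentiwordnet as swn

def analyze_sentiment_sentiwordnet_lexicon_sketch(review):
    # map Penn Treebank tag prefixes onto SentiWordNet POS codes
    tag_map = {'NN': 'n', 'VB': 'v', 'JJ': 'a', 'RB': 'r'}
    pos_score = neg_score = token_count = 0
    for word, tag in nltk.pos_tag(nltk.word_tokenize(review)):
        swn_pos = tag_map.get(tag[:2])
        if not swn_pos:
            continue
        synsets = list(swn.senti_synsets(word, swn_pos))
        if synsets:
            # score with the first (most frequent) synset
            pos_score += synsets[0].pos_score()
            neg_score += synsets[0].neg_score()
            token_count += 1
    final_score = (pos_score - neg_score) / max(token_count, 1)
    return 'positive' if final_score >= 0 else 'negative'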
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets; keep negation words so phrases like "not good" survive
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('but')
stop_words.remove('not')

norm_train_reviews = tn.normalize_corpus(train_reviews, stopwords=stop_words)
norm_test_reviews = tn.normalize_corpus(test_reviews, stopwords=stop_words)

# tokenize train & test datasets
tokenized_train = [tn.tokenizer.tokenize(text) for text in norm_train_reviews]
tokenized_test = [tn.tokenizer.tokenize(text) for text in norm_test_reviews]

# build vocabulary mapping (word to index)
from collections import Counter

token_counter = Counter([token for review in tokenized_train
                         for token in review])
vocab_map = {item[0]: index + 1
             for index, item in enumerate(dict(token_counter).items())}
max_index = np.max(list(vocab_map.values()))
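With vocab_map in place, a likely next step (an assumption; it is not shown here) is to turn each tokenized review into a fixed-length sequence of integer ids, reserving 0 for padding and out-of-vocabulary tokens:

max_len = np.max([len(review) for review in tokenized_train])
# map tokens to ids; unseen tokens fall back to 0
train_X = [[vocab_map.get(token, 0) for token in review]
           for review in tokenized_train]
# left-pad (or truncate) every sequence to max_len
train_X = [[0] * (max_len - len(seq)) + seq[:max_len] for seq in train_X]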
from nltk.corpus import gutenberg
import text_normalizer as tn
import nltk
from operator import itemgetter

Loading the corpus, Alice's Adventures in Wonderland.

# load corpus
alice = gutenberg.sents(fileids='carroll-alice.txt')
# concatenate the word tokens of each sentence
alice = [' '.join(ts) for ts in alice]
# normalize text
# `filter()` removes documents that are empty (falsy) after normalization
norm_alice = list(filter(None,
                         tn.normalize_corpus(alice, text_lemmatization=False)))

Compare the raw text vs. the normalized text:

print(alice[0], '\n', norm_alice[0])

### N-grams

A function to create n-grams:

def compute_ngrams(sequence, n):
    return list(
        zip(*(sequence[index:] for index in range(n)))
    )
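For example, extracting bigrams from a short token sequence:

compute_ngrams(['this', 'is', 'a', 'test'], 2)
# [('this', 'is'), ('is', 'a'), ('a', 'test')]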
dataset = pd.read_csv(r'movie_reviews.csv')
# take a peek at the data
print(dataset.head())

reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

# Extract features from positive and negative reviews

from sklearn.feature_extraction.text import TfidfVectorizer

# consolidate all normalized reviews
norm_reviews = norm_train_reviews + norm_test_reviews
# get tf-idf features for only positive reviews
positive_reviews = [review for review, sentiment in zip(norm_reviews, sentiments)
                    if sentiment == 'positive']
ptvf = TfidfVectorizer(use_idf=True, min_df=0.05, max_df=0.95,
                       ngram_range=(1, 1), sublinear_tf=True)
ptvf_features = ptvf.fit_transform(positive_reviews)
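To sanity-check the vectorizer, the highest-weighted terms can be inspected; a quick sketch (get_feature_names_out needs scikit-learn >= 1.0; older versions use get_feature_names):

# average tf-idf weight of each term across the positive reviews
avg_weights = np.asarray(ptvf_features.mean(axis=0)).ravel()
feature_names = ptvf.get_feature_names_out()
for idx in avg_weights.argsort()[::-1][:10]:
    print(feature_names[idx], round(avg_weights[idx], 4))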
        overall_2013 += pred[0]
    if date[:4] == "2012":
        overall_2012 += pred[0]
    print('-' * 60)

predicted_sentiments = [
    analyze_sentiment_sentiwordnet_lexicon(review, verbose=False)[1]
    for review in test_reviews
]
meu.display_model_performance_metrics(true_labels=test_sentiments,
                                      predicted_labels=predicted_sentiments,
                                      classes=['positive', 'negative'])

# plot the yearly sentiment totals accumulated above
import matplotlib.pyplot as plt

years = ('2012', '2013', '2014', '2015', '2016', '2017')
x_pos = np.arange(len(years))
overall = [overall_2012, overall_2013, overall_2014,
           overall_2015, overall_2016, overall_2017]

plt.bar(x_pos, overall, align='center', color='b')
plt.xticks(x_pos, years)
plt.ylabel('Overall')
plt.title("Obama's Tweets during 2012-2017")
plt.show()

# normalize dataset
norm_test_reviews = tn.normalize_corpus(test_tweets)
for file in second_folder:
    data = pd.read_excel('datasets/combined/' + folder + '/' + file)
    # alternative inputs, kept for reference:
    # for file in elect_results_dir:
    #     data = pd.read_excel('datasets/combined/elect_results/' + file, index_col=[0])
    #     # data = data.drop(data.columns[0], axis=1)
    # data = pd.read_excel('datasets/combined/day_one/election_day_one_combined.xlsx', index_col=[0])
    # for file in day_three_dir:
    #     data = pd.read_excel('datasets/combined/day_three/' + file, index_col=[0])

    # separate tweets from retweets, then normalize
    mask = data['tweet_type'] == 'tweet'
    reg_tweets = data[mask].copy()  # .copy() avoids SettingWithCopyWarning on the column assignment below
    # reg_tweets = data[~data['tweet_text'].str.contains('RT')]
    reg_tweets['norm_tweets'] = tn.normalize_corpus(reg_tweets['tweet_text'],
                                                    stopwords=stop_words)

    # group normalized tweets into subtopic lists by VADER sentiment:
    # positive, negative, neutral
    tweets = [[], [], []]
    tweets[0] = [row['norm_tweets'] for _, row in reg_tweets.iterrows()
                 if row['vd__polarity_sentiment'] == 'positive']
    tweets[1] = [row['norm_tweets'] for _, row in reg_tweets.iterrows()
                 if row['vd__polarity_sentiment'] == 'negative']
    tweets[2] = [row['norm_tweets'] for _, row in reg_tweets.iterrows()
                 if row['vd__polarity_sentiment'] == 'neutral']
    # identical grouping kept under a second name
    tweets_senti = [list(group) for group in tweets]

    new_df = pd.DataFrame()
    # loop through the data and run through the topic model
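The comment above announces the topic-model step, but the model itself is not in this snippet. A hedged sketch of what that step could look like per sentiment group, using scikit-learn's LatentDirichletAllocation as a stand-in (the original model is unknown):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

for label, group in zip(['positive', 'negative', 'neutral'], tweets):
    if not group:
        continue
    cv = CountVectorizer(min_df=2)
    dtm = cv.fit_transform(group)
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)
    # show the ten strongest terms per topic
    terms = cv.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        top_terms = [terms[i] for i in topic.argsort()[::-1][:10]]
        print(label, topic_idx, top_terms)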
            predicted_labels, average='weighted'), 4), np.round(
        metrics.f1_score(true_labels, predicted_labels, average='weighted'), 4)

data_df = tn.create_df_from_input_labeled('labeled2')

# normalize data
norm_corpus, emoji, timecode = tn.normalize_corpus(
    corpus=data_df['Comment'],
    extract_timecodes=True,
    special_char_removal=True,
    use_emoji=True,
    repeated_characters_remover=True,
    text_lower_case=True,
    stop_words_remover=True,
    text_lemmatization=True)

data_df['Clean_Comment'] = norm_corpus
data_df['Emoji'] = emoji
data_df['TimeCodes'] = timecode
print("Cleaned comments:\n", data_df['Clean_Comment'])
print("Data shape", data_df.shape)

# find empty documents in the dataset and remove them
total_nulls = data_df[data_df.Clean_Comment.str.strip() == ''].shape[0]
print("Empty documents:", total_nulls)
print("Data shape before removing empty documents:", data_df.shape)
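The removal itself is not shown above; presumably it filters on the same condition, e.g.:

data_df = data_df[data_df.Clean_Comment.str.strip() != '']
print("Data shape after removing empty documents:", data_df.shape)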
total_nulls = data_df[data_df['review'].str.strip() == ""].shape[0]
print("Empty documents:", total_nulls)

# +
import nltk

stopword_list = nltk.corpus.stopwords.words('english')
# keep negations so they survive into bi-grams
stopword_list.remove('no')
stopword_list.remove('not')

norm_corpus = tn.normalize_corpus(corpus=data_df['review'],
                                  html_stripping=True,
                                  contraction_expansion=True,
                                  accented_char_removal=True,
                                  text_lower_case=True,
                                  text_lemmatization=True,
                                  text_stemming=False,
                                  special_char_removal=True,
                                  remove_digits=True,
                                  stopword_removal=True,
                                  stopwords=stopword_list)
data_df['clean review'] = norm_corpus
# -

data_df.head()

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(data_df['sentiment'])
le.classes_
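le.classes_ lists the learned label order; encoding the column for downstream models is then standard scikit-learn usage (the variable name y is ours):

y = le.transform(data_df['sentiment'])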