Example no. 1
def readFromTweet():
    dataset = pd.read_csv(r'CSVs/tweet.csv')

    reviews = np.array(dataset['text'])
    dates = np.array(dataset['created_at'])
    test_reviews = reviews
    test_dates = dates

    sample_review_ids = [430, 200, 470]

    # normalize the dataset
    norm_test_reviews = tn.normalize_corpus(test_reviews, html_stripping=False)

    awesome = list(swn.senti_synsets('awesome', 'a'))[0]
    print('Positive Polarity Score:', awesome.pos_score())
    print('Negative Polarity Score:', awesome.neg_score())
    print('Objective Score:', awesome.obj_score())

    predicted_sentiments = [
        analyze_sentiment_sentiwordnet_lexicon(review, verbose=False)
        for review in norm_test_reviews
    ]

    for s in predicted_sentiments:
        print(s)
    generatScoreCsv(predicted_sentiments, test_dates)
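Example 1 leans on analyze_sentiment_sentiwordnet_lexicon, which is defined elsewhere. As a rough idea of how such a lexicon-based analyzer typically works, here is a minimal sketch built only on standard NLTK calls; the function name, the POS-tag mapping, and the zero threshold are assumptions, not the original implementation.

import nltk
from nltk.corpus import sentiwordnet as swn

def sentiwordnet_sentiment_sketch(review, verbose=False):
    # sum positive/negative SentiWordNet scores over the review's tokens
    pos_score = neg_score = token_count = 0
    for word, tag in nltk.pos_tag(nltk.word_tokenize(review)):
        ss_type = {'NN': 'n', 'VB': 'v', 'JJ': 'a', 'RB': 'r'}.get(tag[:2])
        if not ss_type:
            continue
        synsets = list(swn.senti_synsets(word, ss_type))
        if not synsets:
            continue
        synset = synsets[0]  # take the most common sense
        pos_score += synset.pos_score()
        neg_score += synset.neg_score()
        token_count += 1
    final_score = (pos_score - neg_score) / max(token_count, 1)
    sentiment = 'positive' if final_score >= 0 else 'negative'
    if verbose:
        print('Sentiment:', sentiment, 'Aggregated score:', round(final_score, 2))
    return sentiment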
Example no. 2
def readFromCSV():
    dataset = pd.read_csv(r'CSVs/tweets_main.csv')

    reviews = np.array(dataset['review'])
    sentiments = np.array(dataset['sentiment'])

    # extract data for model evaluation
    test_reviews = reviews[:5000]
    test_sentiments = sentiments[:5000]
    #sample_review_ids = [7626, 3533, 13010]

    # normalize dataset
    norm_test_reviews = tn.normalize_corpus(test_reviews)

    predicted_sentiments = [
        analyze_sentiment_sentiwordnet_lexicon(review, verbose=False)
        for review in norm_test_reviews
    ]

    meu.display_model_performance_metrics(
        true_labels=test_sentiments,
        predicted_labels=predicted_sentiments,
        classes=['positive', 'negative'])
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('but')
stop_words.remove('not')

norm_train_reviews = tn.normalize_corpus(train_reviews, stopwords=stop_words)
norm_test_reviews = tn.normalize_corpus(test_reviews, stopwords=stop_words)

# Tokenize train & test datasets

tokenized_train = [tn.tokenizer.tokenize(text) for text in norm_train_reviews]
tokenized_test = [tn.tokenizer.tokenize(text) for text in norm_test_reviews]

# Build Vocabulary Mapping (word to index)

from collections import Counter

# build word to index vocabulary
token_counter = Counter([token for review in tokenized_train for token in review])
vocab_map = {word: index + 1 for index, word in enumerate(token_counter)}
max_index = np.max(list(vocab_map.values()))
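The index + 1 offset in vocab_map leaves index 0 free, typically for padding. Before feeding reviews to a model, each tokenized review would then be mapped to a fixed-length sequence of vocabulary indices; the sketch below does this with plain NumPy, and the helper name, the left padding, and mapping unseen tokens to max_index + 1 are assumptions rather than part of the original snippet.

max_len = max(len(tokens) for tokens in tokenized_train)

def to_index_sequence(tokens, vocab_map, max_len, oov_index):
    # map tokens to vocabulary indices, unseen tokens to oov_index
    ids = [vocab_map.get(token, oov_index) for token in tokens][:max_len]
    # left-pad with zeros (the reserved index) up to max_len
    return [0] * (max_len - len(ids)) + ids

train_X = np.array([to_index_sequence(t, vocab_map, max_len, max_index + 1)
                    for t in tokenized_train])
test_X = np.array([to_index_sequence(t, vocab_map, max_len, max_index + 1)
                   for t in tokenized_test])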
dataset = pd.read_csv(r'movie_reviews.csv')

# take a peek at the data
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

# Extract features from positive and negative reviews

from sklearn.feature_extraction.text import TfidfVectorizer

# consolidate all normalized reviews
norm_reviews = norm_train_reviews + norm_test_reviews
# get tf-idf features for only positive reviews
positive_reviews = [
    review for review, sentiment in zip(norm_reviews, sentiments)
    if sentiment == 'positive'
]
Example no. 5
from nltk.corpus import gutenberg
import text_normalizer as tn
import nltk
from operator import itemgetter

Load the corpus: Alice's Adventures in Wonderland.

# load corpus
alice = gutenberg.sents(fileids='carroll-alice.txt')
# join each sentence's word tokens back into a single string
alice = [' '.join(ts) for ts in alice]
# normalize text
# `filter(None, ...)` drops documents that come back empty from normalization
norm_alice = list(filter(None, 
                         tn.normalize_corpus(alice, text_lemmatization=False))) 

Compare the raw text with the normalized text:

print(alice[0], '\n', norm_alice[0])

### N-grams

A function to create n-grams.

def compute_ngrams(sequence, n):
    return list(zip(*(sequence[index:] for index in range(n))))
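For instance, on a short token list and on the first normalized sentence of the corpus:

print(compute_ngrams(['the', 'quick', 'brown', 'fox'], 2))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]
print(compute_ngrams(norm_alice[0].split(), 3)[:2])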
dataset = pd.read_csv(r'movie_reviews.csv')

# take a peek at the data
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)


# Extract features from positive and negative reviews

from sklearn.feature_extraction.text import TfidfVectorizer

# consolidate all normalized reviews
norm_reviews = norm_train_reviews + norm_test_reviews
# get tf-idf features for only positive reviews
positive_reviews = [
    review for review, sentiment in zip(norm_reviews, sentiments)
    if sentiment == 'positive'
]
ptvf = TfidfVectorizer(use_idf=True, min_df=0.05, max_df=0.95,
                       ngram_range=(1, 1), sublinear_tf=True)
ptvf_features = ptvf.fit_transform(positive_reviews)
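With the positive-review matrix built, a natural next step is to inspect which terms carry the most weight. A minimal sketch (assuming scikit-learn >= 1.0 for get_feature_names_out, and that ranking by mean tf-idf weight is the intended use):

mean_weights = np.asarray(ptvf_features.mean(axis=0)).ravel()
terms = np.array(ptvf.get_feature_names_out())
top_positive_terms = terms[np.argsort(mean_weights)[::-1][:20]]
print(top_positive_terms)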
        overall_2013 += pred[0]
    if date[:4] == "2012":
        overall_2012 += pred[0]
    print('-' * 60)

predicted_sentiments = [
    analyze_sentiment_sentiwordnet_lexicon(review, verbose=False)[1]
    for review in test_reviews
]

meu.display_model_performance_metrics(true_labels=test_sentiments,
                                      predicted_labels=predicted_sentiments,
                                      classes=['positive', 'negative'])

# Plot the overall scores calculated above, one bar per year
years = ('2012', '2013', '2014', '2015', '2016', '2017')
x_pos = np.arange(len(years))
overall = [
    overall_2012, overall_2013, overall_2014, overall_2015, overall_2016,
    overall_2017
]
plt.bar(x_pos, overall, align='center', color='b')
plt.xticks(x_pos, years)
plt.ylabel('Overall')
plt.title("Obama's Tweets during 2012-2017")

plt.show()

# normalize dataset
norm_test_reviews = tn.normalize_corpus(test_tweets)
    for file in second_folder:
        data = pd.read_excel('datasets/combined/'+folder+'/'+file)

#     for file in elect_results_dir:    
#         data = pd.read_excel('datasets/combined/elect_results/'+file, index_col=[0])
# #data = data.drop(data.columns[0], axis=1)
# data = pd.read_excel('datasets/combined/day_one/election_day_one_combined.xlsx', index_col=[0])
    # for file in day_three_dir:    
    #     data = pd.read_excel('datasets/combined/day_three/'+file, index_col=[0])
        # Separate tweets from retweets, then normalize them.
        mask = data['tweet_type'] == 'tweet'
        reg_tweets = data[mask]
        #reg_tweets = data[~data['tweet_text'].str.contains('RT')]
        # reg_tweets['tweet_text'] = [tweet['tweet_text'] for i, tweet in tweets_senti.iterrows() if tweet['tweet_type'] == 'tweet']
        # reg_tweets['vader_sentiment'] = [tweet['vd__polarity_sentiment'] for i, tweet in tweets_senti.iterrows() if tweet['tweet_type'] == 'tweet']
        reg_tweets['norm_tweets'] = tn.normalize_corpus(reg_tweets['tweet_text'], stopwords=stop_words)
        
        tweets = [[], [], []]
        tweets_senti = [[], [], []]

        # Create per-sentiment lists (positive, negative, neutral VADER sentiment) for the subtopic step.
        for i, label in enumerate(['positive', 'negative', 'neutral']):
            label_mask = reg_tweets['vd__polarity_sentiment'] == label
            tweets[i] = list(reg_tweets.loc[label_mask, 'norm_tweets'])
            tweets_senti[i] = list(reg_tweets.loc[label_mask, 'norm_tweets'])
        new_df = pd.DataFrame()
        
        # Loop through the data and run through the topic model.
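        # --- Sketch only, not the original code: one plausible shape for the
        # topic-modelling step, using scikit-learn's CountVectorizer and
        # LatentDirichletAllocation (3 topics per sentiment bucket, top 8 terms). ---
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.decomposition import LatentDirichletAllocation

        for label, docs in zip(['positive', 'negative', 'neutral'], tweets):
            if not docs:
                continue
            cv = CountVectorizer()
            dtm = cv.fit_transform(docs)
            lda = LatentDirichletAllocation(n_components=3, random_state=42)
            lda.fit(dtm)
            terms = cv.get_feature_names_out()
            for topic_idx, weights in enumerate(lda.components_):
                top_terms = [terms[i] for i in weights.argsort()[::-1][:8]]
                print(label, 'topic', topic_idx, ':', ', '.join(top_terms))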
Example no. 9
                                                 predicted_labels,
                                                 average='weighted'),
                            4), np.round(
                                metrics.f1_score(true_labels,
                                                 predicted_labels,
                                                 average='weighted'), 4)


data_df = tn.create_df_from_input_labeled('labeled2')

# normalize data
norm_corpus, emoji, timecode = tn.normalize_corpus(
    corpus=data_df['Comment'],
    extract_timecodes=True,
    special_char_removal=True,
    use_emoji=True,
    repeated_characters_remover=True,
    text_lower_case=True,
    stop_words_remover=True,
    text_lemmatization=True)
data_df['Clean_Comment'] = norm_corpus
data_df['Emoji'] = emoji
data_df['TimeCodes'] = timecode

print("Cleanned comments:\n", data_df['Clean_Comment'])
print("Data shape", data_df.shape)

# find empty documents in dataset and remove them
total_nulls = data_df[data_df.Clean_Comment.str.strip() == ''].shape[0]
print("Empty documents:", total_nulls)
print("Data shape before removing empty documents:", data_df.shape)
Example no. 10
total_nulls = data_df[data_df['review'].str.strip() == ""].shape[0]
print("Empty documents:", total_nulls)

import nltk
stopword_list = nltk.corpus.stopwords.words('english')
# keep negation words so they can appear in bi-grams
stopword_list.remove('no')
stopword_list.remove('not')

norm_corpus = tn.normalize_corpus(corpus=data_df['review'],
                                  html_stripping=True,
                                  contraction_expansion=True,
                                  accented_char_removal=True,
                                  text_lower_case=True,
                                  text_lemmatization=True,
                                  text_stemming=False,
                                  special_char_removal=True,
                                  remove_digits=True,
                                  stopword_removal=True,
                                  stopwords=stopword_list)
data_df['clean review'] = norm_corpus

data_df.head()

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data_df['sentiment'])

le.classes_
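As a short usage note, the fitted encoder can turn the sentiment column into an integer target vector (an assumption about how it is used downstream, not shown in the original snippet):

# encode each review's sentiment label as an integer class id
y = le.transform(data_df['sentiment'])
print(dict(zip(le.classes_, le.transform(le.classes_))))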