Example no. 1
def train():
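  # read_tweets is assumed to be a project helper that returns a list of
  # (token_list, label) documents for the given file and sentiment label.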
  positive_tweets = read_tweets('/root/295/new/positive.txt', 'positive')
  negative_tweets = read_tweets('/root/295/new/negative.txt', 'negative')
  print len(positive_tweets)
  print len(negative_tweets)

  #pos_train = positive_tweets[:2000]
  #neg_train = negative_tweets[:2000]
  #pos_test = positive_tweets[2001:3000]
  #neg_test = negative_tweets[2001:3000]
  # 80/20 train/test split for each class
  pos_train = positive_tweets[:len(positive_tweets)*80//100]
  neg_train = negative_tweets[:len(negative_tweets)*80//100]
  pos_test = positive_tweets[len(positive_tweets)*80//100:]
  neg_test = negative_tweets[len(negative_tweets)*80//100:]

  training_data = pos_train + neg_train
  test_data = pos_test + neg_test

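  # Mark negation scope on the training documents, select frequently occurring
  # unigrams (min_freq=4) as features, and train NLTK's Naive Bayes classifier.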
  sentim_analyzer = SentimentAnalyzer()
  all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_data])
  #print all_words_neg
  unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
  #print unigram_feats
  print len(unigram_feats)
  sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
  training_set = sentim_analyzer.apply_features(training_data)
  test_set = sentim_analyzer.apply_features(test_data)
  print test_set  
  trainer = NaiveBayesClassifier.train
  classifier = sentim_analyzer.train(trainer, training_set)
  for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))
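  # tokenize_sentance is assumed to be a helper defined elsewhere that splits a
  # sentence string into a list of tokens.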
  print sentim_analyzer.classify(tokenize_sentance('I hate driving car at night'))
  
  return sentim_analyzer
Example no. 2
    senti = line.split(",")[0]
    content = line[len(senti)+1:]
    tokens = word_tokenize(content.rstrip())
    trainingset.append((tokens,senti))
all_words_neg = sa.all_words([mark_negation(doc) for doc in trainingset])
unigram_feats = sa.unigram_word_feats(all_words_neg,min_freq = 4)
sa.add_feat_extractor(extract_unigram_feats,unigrams=unigram_feats)
training_set = sa.apply_features(trainingset)

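# Read semicolon-separated tweet records from stdin and emit one
# "<sentiment>\t<1 + likes>" line per tweet (e.g. as a Hadoop-streaming-style mapper).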
for line in sys.stdin:
    if "username" in line:
        continue

    tweetWords=[]
    tweet= line.split(";")[4]
    likes = line.split(";")[3]
    likes = int(likes)
    num = 1 + likes  # weight each tweet by its likes; a tweet with 0 likes still counts once

    words = tweet.split()
    for i in words:
        i = i.lower()
        i = i.strip('@#\'"?,.!')
        tweetWords.append(i)
    sentiment = sa.classify(tweetWords)
    print '%s\t%s' % (sentiment, str(num))

Example no. 3
    "southwestair": "Southwest Airlines",
    "delta": "Delta"
}

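# For each airline, classify every tweet in its CSV with the trained analyzer and
# tally positive vs. negative predictions (results_dict, sentim_analyzer and
# tweet_preprocessor are assumed to be set up earlier in the script).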
for airline_name in ["americanair", "united", "southwestair", "delta"]:
    with open("case_study_dataset_{}.csv".format(airline_name),
              "r") as file_handle:
        next(file_handle)  # Skip the header
        dataset = list()
        for line in file_handle.readlines():
            dataset.append(
                nltk.word_tokenize(
                    tweet_preprocessor.clean(html.unescape(line))))

    positive_sentiment_count = 0
    negative_sentiment_count = 0

    for tweet in dataset:
        sentiment_score = sentim_analyzer.classify(tweet)
        if sentiment_score == 0:
            negative_sentiment_count = negative_sentiment_count + 1
        else:
            positive_sentiment_count = positive_sentiment_count + 1

    results_dict["Airline"].append(airline_friendly_name_map[airline_name])
    results_dict["Positive Sentiment Count"].append(positive_sentiment_count)
    results_dict["Negative Sentiment Count"].append(negative_sentiment_count)

pd.DataFrame(results_dict).to_csv(
    "case_study_naive_bayes_classifier_with_emojis.csv")
Example no. 4
print len(test), len(train)
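# Build the vocabulary from the already tokenised training documents, keep
# frequent unigrams (min_freq=4) as features, then train and evaluate a Naive
# Bayes classifier (train and test are assumed to be lists of
# (token_list, label) pairs prepared earlier).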
sentiment_analyzer = SentimentAnalyzer()
all_words = sentiment_analyzer.all_words([doc[0] for doc in train])

# # Get list of terms+frequencies
# words_freqs = {}
# for tweet in train:
# 	for token in tweet[0]:
# 		if token in words_freqs:
# 			words_freqs[token] += 1
# 		else:
# 			words_freqs[token] = 1

# unigrams = [token for token in words_freqs if words_freqs[token] >= 4]
unigrams = sentiment_analyzer.unigram_word_feats(all_words, min_freq=4)
#bigrams = sentiment_analyzer.bigram_collocation_feats([doc[0] for doc in train], top_n=1000)
# print unigrams

sentiment_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigrams)
#sentiment_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigrams)

training_set = sentiment_analyzer.apply_features(train)
test_set = sentiment_analyzer.apply_features(test)
#print training_set[0]
trainer = NaiveBayesClassifier.train
classifier = sentiment_analyzer.train(trainer, training_set)
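# save_file is assumed to be a project helper that pickles the trained analyzer
# to the given filename.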
save_file(sentiment_analyzer, "sentiment_classifier.pkl")
for key, value in sorted(sentiment_analyzer.evaluate(test_set).items()):
    print("{0}: {1}".format(key, value))
print test[0], sentiment_analyzer.classify(test[0][0])
Example no. 5
class DiplomaSentimentAnalyzer:
    def __init__(self, n_instances=500):
        self.n_instances = n_instances
        self.subj_classifier = None
        self.sentim_analyzer = None
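        # Load a previously pickled analyzer if one exists; otherwise train a new
        # subjectivity classifier from the Pang/Lee subjectivity corpus files
        # (plot.tok.gt9.5000 = objective sentences, quote.tok.gt9.5000 = subjective).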
        try:
            BASE_DIR = os.path.dirname(
                os.path.dirname(os.path.abspath(__file__)))
            with open(os.path.join(BASE_DIR, 'main', 'my_classifier.pickle'),
                      'rb') as f:
                sentim_analyzer = pickle.load(f)
                self.sentim_analyzer = sentim_analyzer
        except IOError:
            with open('plot.tok.gt9.5000') as obj_sents:
                obj_sents = obj_sents.read()
            with open('quote.tok.gt9.5000') as subj_sents:
                subj_sents = subj_sents.read()
            self.obj_sents = obj_sents
            self.sentim_analyzer = SentimentAnalyzer()
            self.train_diploma(subj_sents, obj_sents)

    def prepair_train_data(self, sents, category):
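        """Sentence-tokenize the raw text, word-tokenize each sentence, drop
        English stopwords, lowercase and lemmatize the remaining tokens, and
        return a list of (token_list, category) documents."""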
        stop_words = set(stopwords.words('english'))
        sents_processed = sent_tokenize(sents)
        sents_final = []
        for k in sents_processed:
            sent = word_tokenize(k)
            sent = [w.lower() for w in sent if w not in stop_words]
            lemmatizer = WordNetLemmatizer()
            sent = [lemmatizer.lemmatize(j) for j in sent]
            sents_final.append(sent)
        sents_final = [(i, category) for i in sents_final]
        return sents_final

    def train_diploma(self, subj_sents, obj_sents):
        train_subj = self.prepair_train_data(subj_sents,
                                             'subj')[:self.n_instances]
        train_obj = self.prepair_train_data(obj_sents,
                                            'obj')[:self.n_instances]
        training_docs = train_subj + train_obj
        all_words_neg = self.sentim_analyzer.all_words(
            [mark_negation(doc) for doc in training_docs])
        unigram_feats = self.sentim_analyzer.unigram_word_feats(all_words_neg,
                                                                min_freq=4)
        self.sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                                unigrams=unigram_feats)
        training_set = self.sentim_analyzer.apply_features(training_docs)
        trainer = NaiveBayesClassifier.train
        subj_classifier = self.sentim_analyzer.train(trainer, training_set)
        self.subj_classifier = subj_classifier
        with open('my_classifier.pickle', 'wb') as f:
            pickle.dump(self.sentim_analyzer, f)

    def get_sentiment_values(self, text):
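        """Return the share of tokens classified as subjective (as a percentage)
        and a VADER compound polarity score rescaled to the 0-100 range."""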
        result = {}
        if text:
            stop_words = set(stopwords.words('english'))
            sentences = word_tokenize(text)
            sentences = [w.lower() for w in sentences if w not in stop_words]
            lemmatizer = WordNetLemmatizer()
            sentences = [lemmatizer.lemmatize(j) for j in sentences]
            subj_count = 0
            for i in sentences:
                a = self.sentim_analyzer.classify([i])  # classify expects a list of tokens
                if a == 'subj':
                    subj_count += 1

            result['subjectivity'] = round(
                100.0 * subj_count / len(sentences), 2)
            sid = SentimentIntensityAnalyzer()
            polarity = sid.polarity_scores(text)['compound']
            result['polarity'] = (polarity / 2 + 0.5) * 100
            return result
Example no. 7
class SuicideClassifier(object):
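    """Train an NLTK SentimentAnalyzer to label phrases as 'suicidal' or
    'alright'; with sentiment_only=True only the VADER-based feature extractor
    is used, otherwise unigram features are added as well."""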

    def __init__(self, sentiment_only, num_phrases_to_track=20):
        # neg_phrases = filter_negative_phrases(load_csv_sentences('thoughtsandfeelings.csv'))
        # pos_phrases = filter_positive_phrases(load_csv_sentences('spiritualforums.csv'))
        # file_pos = open("pos_phrases.txt", 'w')
        # file_neg = open("neg_phrases.txt", 'w')

        # for item in pos_phrases:
        #     print>>file_pos, item
        # for item in neg_phrases:
        #     print>>file_neg, item
        self.recent_sentiment_scores = []

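        # Load the pre-filtered phrase lists; the positive (webtext) list is
        # later truncated to the length of the negative list so that both
        # classes are balanced.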
        with open("ALL_neg_phrases_filtered.txt", "r") as neg_file:
            neg_phrases = neg_file.readlines()
        with open("webtext_phrases_with_lots_of_words.txt", "r") as pos_file:
            pos_phrases = pos_file.readlines()

        neg_docs = []
        pos_docs = []
        for phrase in neg_phrases:
            neg_docs.append((phrase.split(), 'suicidal'))
        for phrase in pos_phrases[:len(neg_phrases)]:
            pos_docs.append((phrase.split(), 'alright'))

        print len(neg_docs)
        print len(pos_docs)
        # negcutoff = len(neg_docs) * 3 / 4
        # poscutoff = len(pos_docs) * 3 / 4
        negcutoff = -200
        poscutoff = -200
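        # With negative cutoffs, everything except the last 200 documents of each
        # class is used for training and the last 200 are held out for evaluation.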

        train_pos_docs = pos_docs[:poscutoff]
        test_pos_docs = pos_docs[poscutoff:]
        train_neg_docs = neg_docs[:negcutoff]
        test_neg_docs = neg_docs[negcutoff:]
        training_docs = train_pos_docs + train_neg_docs
        testing_docs = test_pos_docs + test_neg_docs

        self.sentim_analyzer = SentimentAnalyzer()

        if not sentiment_only:
            all_words = self.sentim_analyzer.all_words([doc for doc in training_docs])
            unigram_feats = self.sentim_analyzer.unigram_word_feats(all_words, min_freq=1)
            self.sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

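        # vader_sentiment_feat is assumed to be a custom feature extractor defined
        # elsewhere in this project (presumably adding VADER polarity scores as
        # features).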
        self.sentim_analyzer.add_feat_extractor(vader_sentiment_feat)

        # bigram_feats = self.sentim_analyzer.bigram_collocation_feats(all_words, min_freq=1)
        # self.sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_feats)

        training_set = self.sentim_analyzer.apply_features(training_docs)
        test_set = self.sentim_analyzer.apply_features(testing_docs)
        trainer = NaiveBayesClassifier.train
        self.classifier = self.sentim_analyzer.train(trainer, training_set)
        for key, value in sorted(self.sentim_analyzer.evaluate(test_set).items()):
            print('{0}: {1}'.format(key, value))
        self.classifier.show_most_informative_features(20)

    def test(self, phrase):
        return self.sentim_analyzer.classify(phrase.split())

    def update_sentiments(self, value):
        now = datetime.datetime.now()
        self.recent_sentiment_scores.append([now, value])
        # Keep only the scores from the last 60 seconds and return their mean.
        self.recent_sentiment_scores = [
            x for x in self.recent_sentiment_scores
            if x[0] > now - datetime.timedelta(seconds=60)]
        average = sum(x[1] for x in self.recent_sentiment_scores) / len(self.recent_sentiment_scores)
        print average
        return average