def __init__(self):
    """Train a small subjectivity classifier and keep it on the instance.

    The first 100 sentences of each ``subjectivity`` category ('subj' and
    'obj') are used: the first 80 per category for training, the remaining
    20 for evaluation. The evaluation metrics are printed, sorted by name.

    Sets ``self.trainer``, ``self.classifier`` and ``self.sid`` (a
    ``SentimentIntensityAnalyzer``).
    """
    # Every document is a (tokenized sentence, label) tuple.
    sample_size = 100
    labelled = {
        label: [(sent, label)
                for sent in subjectivity.sents(categories=label)[:sample_size]]
        for label in ('subj', 'obj')
    }
    subjective, objective = labelled['subj'], labelled['obj']

    # Each class is split separately so both sets stay balanced.
    training_docs = subjective[:80] + objective[:80]
    testing_docs = subjective[80:100] + objective[80:100]

    analyzer = SentimentAnalyzer()
    negated_docs = [mark_negation(doc) for doc in training_docs]
    vocabulary = analyzer.all_words(negated_docs)

    # Unigram features over the negation-marked vocabulary.
    unigram_feats = analyzer.unigram_word_feats(vocabulary, min_freq=4)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Turn both document sets into feature-value representations.
    training_set = analyzer.apply_features(training_docs)
    test_set = analyzer.apply_features(testing_docs)

    self.trainer = NaiveBayesClassifier.train
    self.classifier = analyzer.train(self.trainer, training_set)

    metrics = analyzer.evaluate(test_set)
    for metric_name in sorted(metrics):
        print('{0}: {1}'.format(metric_name, metrics[metric_name]))

    self.sid = SentimentIntensityAnalyzer()
def train():
    """Train a Naive Bayes subjectivity classifier and print its evaluation.

    ``n_instances`` is not defined here; it is read from an enclosing scope
    and caps how many sentences are taken per category. The first 80
    documents of each category train the model and documents 80-99 are used
    for evaluation. Nothing is returned.
    """
    training_docs, testing_docs = [], []
    for label in ('subj', 'obj'):
        docs = [(sent, label)
                for sent in subjectivity.sents(categories=label)[:n_instances]]
        training_docs.extend(docs[:80])
        testing_docs.extend(docs[80:100])

    analyzer = SentimentAnalyzer()
    vocabulary = analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])
    unigram_feats = analyzer.unigram_word_feats(vocabulary, min_freq=4)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    training_set = analyzer.apply_features(training_docs)
    test_set = analyzer.apply_features(testing_docs)

    # The analyzer keeps the trained classifier internally for evaluate().
    analyzer.train(NaiveBayesClassifier.train, training_set)

    metrics = analyzer.evaluate(test_set)
    for metric_name in sorted(metrics):
        print('{0}: {1}'.format(metric_name, metrics[metric_name]))
def trainSubjectivity():
    """Train, save and evaluate a subjective-vs-objective sentence classifier.

    Follows the NLTK documentation example. All sentences of both
    ``subjectivity`` categories are used, split 90% training / 10% test per
    category. The trained classifier is dumped to ``subjectivity.pkl`` with
    joblib and the evaluation metrics are printed, sorted by name.
    """
    def ninety_ten(docs):
        # Leading 90% for training, the rest for testing.
        cut = int(.9 * len(docs))
        return docs[:cut], docs[cut:]

    subj_train, subj_test = ninety_ten(
        [(sent, 'subj') for sent in subjectivity.sents(categories='subj')])
    obj_train, obj_test = ninety_ten(
        [(sent, 'obj') for sent in subjectivity.sents(categories='obj')])
    train_docs = subj_train + obj_train
    test_docs = subj_test + obj_test

    # Unigram features over the negation-marked training vocabulary.
    analyzer = SentimentAnalyzer()
    vocabulary = analyzer.all_words([mark_negation(doc) for doc in train_docs])
    unigram_feats = analyzer.unigram_word_feats(vocabulary, min_freq=4)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    train_features = analyzer.apply_features(train_docs)
    test_features = analyzer.apply_features(test_docs)

    model = analyzer.train(NaiveBayesClassifier.train, train_features)
    joblib.dump(model, 'subjectivity.pkl')

    metrics = analyzer.evaluate(test_features)
    for metric_name in sorted(metrics):
        print('{0}: {1}'.format(metric_name, metrics[metric_name]))
    # NOTE(review): the original text ended with an opening triple quote whose
    # closing delimiter is not visible here; it is not reproduced as code.
    # Confirm nothing after this function relied on it.
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and evaluate a classifier on sentences of the ``subjectivity`` corpus.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if equal to `True`, the SentimentAnalyzer is stored
        in the pickle file 'sa_subjectivity.pickle'.
    :param n_instances: total number of sentences to use; half are taken from
        the subjective category and half from the objective one. `None` uses
        every sentence.
    :param output: if truthy, passed to `output_markdown` as the destination
        of the results report.
    :return: the trained SentimentAnalyzer.
    """
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import subjectivity

    # Half of the requested instances per class (None keeps the whole class).
    per_class = None if n_instances is None else int(n_instances/2)

    subjective = [(sent, 'subj')
                  for sent in subjectivity.sents(categories='subj')[:per_class]]
    objective = [(sent, 'obj')
                 for sent in subjectivity.sents(categories='obj')[:per_class]]

    # Split each class on its own so train and test sets stay balanced.
    subj_train, subj_test = split_train_test(subjective)
    obj_train, obj_test = split_train_test(objective)
    training_docs = subj_train + obj_train
    testing_docs = subj_test + obj_test

    analyzer = SentimentAnalyzer()
    vocabulary = analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])

    # Unigram word features with negation handling.
    unigram_feats = analyzer.unigram_word_feats(vocabulary, min_freq=4)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Feature-value representation of both datasets.
    training_set = analyzer.apply_features(training_docs)
    test_set = analyzer.apply_features(testing_docs)

    classifier = analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = analyzer.evaluate(test_set)

    if save_analyzer == True:
        save_file(analyzer, 'sa_subjectivity.pickle')

    if output:
        extractor_names = [f.__name__ for f in analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity',
                        Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extractor_names,
                        Instances=per_class, Results=results)

    return analyzer
def GetTrainDataSetForNLTK(self, instances=100):
    """Return labelled training sentences from the ``subjectivity`` corpus.

    :param instances: maximum number of sentences taken from each category.
    :return: list of ``(sentence, label)`` tuples, all 'subj' entries first,
        followed by all 'obj' entries.
    """
    return [
        (sent, label)
        for label in ('subj', 'obj')
        for sent in subjectivity.sents(categories=label)[:instances]
    ]
def sentiment_analysis(data):
    """Annotate every entry of ``data`` with polarity scores.

    A Naive Bayes subjectivity classifier is first trained on a 100-sentence
    sample per category (80 train / 20 test) and its evaluation metrics are
    printed; the classifier itself is not used for the annotation.

    :param data: iterable of mutable mappings that each provide a
        ``'line_text'`` entry. Each mapping is updated in place with the keys
        ``'compound'``, ``'neg'``, ``'pos'`` and ``'neu'`` taken from
        ``SentimentIntensityAnalyzer.polarity_scores``.
    :return: None.
    """
    # Explicit names instead of ``from nltk.sentiment.util import *``: a
    # wildcard import inside a function is a SyntaxError on Python 3.
    from nltk.classify import NaiveBayesClassifier
    from nltk.corpus import subjectivity
    from nltk.sentiment import SentimentAnalyzer
    from nltk.sentiment.util import extract_unigram_feats, mark_negation
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    n_instances = 100
    subj_docs = [(sent, 'subj')
                 for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # Split each class separately so both sets stay balanced.
    training_docs = subj_docs[:80] + obj_docs[:80]
    testing_docs = subj_docs[80:100] + obj_docs[80:100]

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    sentim_analyzer.train(NaiveBayesClassifier.train, training_set)
    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))

    sid = SentimentIntensityAnalyzer()
    for line in data:
        scores = sid.polarity_scores(line['line_text'])
        for score_name in ('compound', 'neg', 'pos', 'neu'):
            line[score_name] = scores[score_name]
def train_sentiment():
    """Train and return a Naive Bayes subjectivity classifier.

    Up to 8000 sentences are taken from each ``subjectivity`` category and
    all of them are used for training (no held-out test set).

    :return: the classifier returned by ``SentimentAnalyzer.train``.
    """
    limit = 8000
    labelled_docs = []
    for label in ('subj', 'obj'):
        labelled_docs += [
            (sent, label)
            for sent in subjectivity.sents(categories=label)[:limit]
        ]

    analyzer = SentimentAnalyzer()
    vocabulary = analyzer.all_words(
        [mark_negation(doc) for doc in labelled_docs])
    unigram_feats = analyzer.unigram_word_feats(vocabulary, min_freq=4)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    feature_set = analyzer.apply_features(labelled_docs)
    return analyzer.train(NaiveBayesClassifier.train, feature_set)
def __init__(self, load=True, set_defaults=True):
    """Set up default attributes and optionally load data and training sets.

    :param load: when equal to ``True``, ``self.load_data()`` is called to
        preload data.
    :param set_defaults: when equal to ``True``, ``self.training_data`` is
        built from the first 500 objective and the first 500 subjective
        sentences of the ``sub`` corpus (objective entries first).
    """
    # Set global defaults
    self.data_source = "papers.csv"
    # Each attribute gets its own list. A chained ``a = b = c = []`` binds
    # every name to the SAME list, so appending to one would silently change
    # all the others.
    self.source = []
    self.keywords = []
    self.authors = []
    self.date = []
    self.text = []
    self.title = []

    # Preload data from master file
    if load == True:
        self.load_data()

    # Prepare default training data for sentiment analysis
    if set_defaults == True:
        training_subjective = [(sentences, "subj")
                               for sentences in sub.sents(categories="subj")[:500]]
        training_objective = [(sentences, "obj")
                              for sentences in sub.sents(categories="obj")[:500]]
        self.training_data = training_objective + training_subjective

    print ("NewsAnalyzer created")
def subjectivity_classifier():
    """Initialize and train a categorical subjectivity analyzer.

    Uses the first 100 sentences of each ``subjectivity`` category: the first
    80 per category for training, the rest for evaluation. The number of
    unigram features and the evaluation metrics are printed.

    :return: the trained ``SentimentAnalyzer``.
    """
    # Explicit names instead of ``from nltk.sentiment.util import *``: a
    # wildcard import inside a function is a SyntaxError on Python 3.
    from nltk.classify import NaiveBayesClassifier
    from nltk.corpus import subjectivity
    from nltk.sentiment import SentimentAnalyzer
    from nltk.sentiment.util import extract_unigram_feats, mark_negation

    N_INSTANCES = 100

    subj_docs = [(sent, 'subj')
                 for sent in subjectivity.sents(categories='subj')[:N_INSTANCES]]
    obj_docs = [(sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:N_INSTANCES]]

    # Split each class separately so both sets stay balanced.
    training_docs = subj_docs[:80] + obj_docs[:80]
    testing_docs = subj_docs[80:] + obj_docs[80:]

    sent_analyzer = SentimentAnalyzer()
    all_words_neg = sent_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])
    unigram_feats = sent_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    print(f"unigram feats: {len(unigram_feats)}")
    sent_analyzer.add_feat_extractor(extract_unigram_feats,
                                     unigrams=unigram_feats)

    training_set = sent_analyzer.apply_features(training_docs)
    test_set = sent_analyzer.apply_features(testing_docs)

    # The analyzer stores the trained classifier for evaluate().
    sent_analyzer.train(NaiveBayesClassifier.train, training_set)
    for k, v in sorted(sent_analyzer.evaluate(test_set).items()):
        print(f"{k}: {v}")

    return sent_analyzer
def analyze_sentiment(paragraph):
    """Return ten times the mean compound polarity score of the sentences.

    ``paragraph`` is split with ``sent_tokenize`` and each sentence is scored
    with ``SentimentIntensityAnalyzer.polarity_scores``; the ``"compound"``
    values are averaged and multiplied by 10.

    The subjectivity classifier the original trained on every call was never
    used for the result, so it is no longer built here.

    :param paragraph: text to score. ``None``, an empty string or
        whitespace-only text yields ``0.0`` instead of a ZeroDivisionError.
    :return: float score.
    """
    # Nothing to score: avoid dividing by a sentence count of zero.
    if not paragraph or not paragraph.strip():
        return 0.0

    sentences = sent_tokenize(paragraph)
    if not sentences:
        return 0.0

    sid = SentimentIntensityAnalyzer()
    total = sum(sid.polarity_scores(sentence)["compound"]
                for sentence in sentences)
    return total * 10 / len(sentences)
def get_objectivity_analyzer():
    """Build and train a subjective/objective ``SentimentAnalyzer``.

    The first 100 sentences of each ``subjectivity`` category are all used
    for training (no test split).

    :return: the trained ``SentimentAnalyzer``.
    """
    per_class = 100
    training_docs = [
        (sent, label)
        for label in ('subj', 'obj')
        for sent in subjectivity.sents(categories=label)[:per_class]
    ]

    analyzer = SentimentAnalyzer()
    vocabulary = analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])
    unigram_feats = analyzer.unigram_word_feats(vocabulary, min_freq=4)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Training stores the classifier on the analyzer, which is what callers get.
    analyzer.train(NaiveBayesClassifier.train,
                   analyzer.apply_features(training_docs))
    return analyzer
def build_NBclassifier():
    '''
    Train a Naive Bayes classifier on the Subjectivity Dataset and pickle it.

    The Subjectivity Dataset contains 5000 subjective and 5000 objective
    processed sentences. Every sentence is labelled, the list is shuffled
    with a fixed seed (1) and the whole list is used for training; the model
    is written to 'NB_Subj_Model.sav'.

    NOTE(review): the original docstring reported 91% accuracy on a 1000
    sentence test set, but this function holds out no test data - confirm
    where that figure came from.
    NOTE(review): the classifier is constructed directly from the training
    list, so this is presumably not NLTK's NaiveBayesClassifier - confirm
    against the file's imports.
    '''
    random.seed(1)

    sents = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')]
    sents += [(sent, 'obj') for sent in subjectivity.sents(categories='obj')]
    random.shuffle(sents)

    cl = NaiveBayesClassifier(sents)

    # save the model to disk; the context manager guarantees the handle is
    # flushed and closed even if pickling fails
    filename = 'NB_Subj_Model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(cl, model_file)
class SentimentAnalysis(object):
    """Subjectivity classifier built at class-definition time, plus a file scorer.

    The class body runs once when the class statement is executed: it takes
    the first 100 sentences per ``subjectivity`` category, trains a Naive
    Bayes classifier on the first 80 of each and keeps every intermediate
    object as a class attribute.
    """

    instance_items = 100

    subjects = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:instance_items]]
    objects = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:instance_items]]

    # Split each class separately so both sets stay balanced.
    subject_train = subjects[:80]
    subject_test = subjects[80:100]
    object_train = objects[:80]
    object_test = objects[80:100]
    training_docs = subject_train + object_train
    testing_docs = subject_test + object_test

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)

    @staticmethod
    def Analysis_Result(file_url):
        """Print polarity scores for every sentence of a text file.

        Declared as a static method because it takes no ``self``; it can be
        called on the class (as before) or on an instance.

        :param file_url: path of the text file to read.
        """
        # The context manager closes the file even if reading fails.
        with open(file_url, "r") as file:
            content = file.read()
        content_list = tokenize.sent_tokenize(content)

        print("Analiz Başladı \n")
        print(content_list)
        print("")

        sentiment_intensity = SentimentIntensityAnalyzer()
        for line in content_list:
            print(line)
            sentiment_score = sentiment_intensity.polarity_scores(line)
            for k in sorted(sentiment_score):
                print('{0}: {1}, '.format(k, sentiment_score[k]), end='')
            print()
def prepare_training_and_test_data(self):
    """
    Build ``self.training_docs`` and ``self.testing_docs``.

    Each document is a tuple (sentence, label) whose sentence is a list of
    token strings, e.g.
    (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about',
      'one', 'thing', 'is', 'a', 'small', 'gem', '.'], 'subj')

    The first ``self.n_instances`` sentences of each category are loaded.
    Per category, the first ``self.n_training`` documents go to training and
    the following ``self.n_testing`` documents go to testing.
    """
    subj_docs, obj_docs = [
        [(sent, label)
         for sent in subjectivity.sents(categories=label)[:self.n_instances]]
        for label in ('subj', 'obj')
    ]

    # Each class is sliced separately so both sets stay balanced.
    first_test = self.n_training
    last_test = first_test + self.n_testing

    self.training_docs = subj_docs[:first_test] + obj_docs[:first_test]
    self.testing_docs = (subj_docs[first_test:last_test]
                         + obj_docs[first_test:last_test])
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

# Take the first n_instances sentences of each category as (tokens, label).
n_instances = 100
subj_docs = [
    (sent, 'subj')
    for sent in subjectivity.sents(categories='subj')[:n_instances]
]
obj_docs = [
    (sent, 'obj')
    for sent in subjectivity.sents(categories='obj')[:n_instances]
]

len(subj_docs), len(obj_docs)
# pasted interpreter output: (100, 100)

subj_docs[0]
# pasted interpreter output:
# (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about',
#   'one', 'thing', 'is', 'a', 'small', 'gem', '.'], 'subj')

# 80 training / 20 test documents per class keeps both sets balanced.
train_subj_docs, test_subj_docs = subj_docs[:80], subj_docs[80:100]
train_obj_docs, test_obj_docs = obj_docs[:80], obj_docs[80:100]
training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words(
    [mark_negation(doc) for doc in training_docs]
)

# min_freq=1 keeps every word of the negation-marked vocabulary.
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=1)
len(unigram_feats)
sentim_analyzer.add_feat_extractor(
    extract_unigram_feats, unigrams=unigram_feats
)

training_set = sentim_analyzer.apply_features(training_docs)
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import subjectivity

# Quick look at the subjectivity corpus: its categories, one sample sentence
# and the word list of the 'subj' category.
corpus_categories = subjectivity.categories()
print(corpus_categories)

sample_sentence = subjectivity.sents()[23]
print(sample_sentence)

subjective_words = subjectivity.words(categories='subj')
print(subjective_words)
def get_tweets(self, query, count=10):
    """Fetch tweets for ``query`` and print polarity scores for each.

    A subjectivity classifier is trained on a 100-sentence sample per
    category (80 train / 20 test), then a fixed list of test sentences and
    every fetched tweet's text is scored with ``SentimentIntensityAnalyzer``
    and printed.

    :param query: search query passed to ``self.api.search``.
    :param count: number of tweets requested from the API.
    :return: the ``tweets`` list, or ``None`` when a ``tweepy.TweepError``
        is caught (the error is printed).

    NOTE(review): nothing is ever appended to ``tweets``, so the returned
    list is always empty - confirm what callers expect.
    """
    tweets = []

    try:
        # Query Twitter first.
        fetched_tweets = self.api.search(q=query, count=count)

        n_instances = 100
        training_docs, testing_docs = [], []
        for label in ('subj', 'obj'):
            docs = [
                (sent, label)
                for sent in subjectivity.sents(categories=label)[:n_instances]
            ]
            training_docs += docs[:80]
            testing_docs += docs[80:100]

        emotion_analyzer = SentimentAnalyzer()

        # Vocabulary of the training documents with negation scopes marked.
        vocabulary = emotion_analyzer.all_words(
            [mark_negation(doc) for doc in training_docs])
        unigram_feats = emotion_analyzer.unigram_word_feats(
            vocabulary, min_freq=4)
        emotion_analyzer.add_feat_extractor(extract_unigram_feats,
                                            unigrams=unigram_feats)

        training_set = emotion_analyzer.apply_features(training_docs)
        emotion_analyzer.apply_features(testing_docs)
        emotion_analyzer.train(NaiveBayesClassifier.train, training_set)

        # Fixed sample sentences scored before the tweets.
        sentences = [
            "Ravi is the worst boy in class",
            "The story is full of mean bitchy characters",
            "I had a good day!",
            "The day was okay",
            "The day was very bad",
            "Harry potter is a good book",
            "New Tata electric car is a piece of shit",
            "It has been a long time since I had a good food",
            "Stop acting as a asshole"
        ]

        sid = SentimentIntensityAnalyzer()

        def report(text):
            # Print the text followed by its scores on a single line.
            print(text)
            scores = sid.polarity_scores(text)
            for name in sorted(scores):
                print('{0}: {1}, '.format(name, scores[name]), end='')
            print()

        for sentence in sentences:
            report(sentence)

        for tweet in fetched_tweets:
            report(tweet.text)

        return tweets

    except tweepy.TweepError as e:
        print("Error : " + str(e))
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

# Up to n_instances labelled sentences per category.
n_instances = 5000
subj_docs = [
    (sent, 'subj')
    for sent in subjectivity.sents(categories='subj')[:n_instances]
]
obj_docs = [
    (sent, 'obj')
    for sent in subjectivity.sents(categories='obj')[:n_instances]
]

# Debug output: sizes, the full document lists and the subjective corpus size.
print("Len subj_docs and obj docs: ", len(subj_docs), len(obj_docs), "\n", "\n")
print("subj_docs: ", subj_docs, "\n", "obj_docs: ", obj_docs, "\n")
print(len(subjectivity.sents(categories='subj')), "\n")

# 4980 training / 20 test documents per class keeps both sets balanced.
train_subj_docs, test_subj_docs = subj_docs[:4980], subj_docs[4980:5000]
train_obj_docs, test_obj_docs = obj_docs[:4980], obj_docs[4980:5000]
training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words(
    [mark_negation(doc) for doc in training_docs]
)

# Unigram features seen at least four times in the negation-marked vocabulary.
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
print(len(unigram_feats), "\n")
sentim_analyzer.add_feat_extractor(
    extract_unigram_feats, unigrams=unigram_feats
)

training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
from nltk import tokenize
from news import SimpleArticle

# Fetch the required NLTK resources, announcing each one once it is ready.
for resource, ready_message in (
        ('subjectivity', "Subjectivity ready"),
        ('punkt', "punkt ready"),
        ('vader_lexicon', "VADER ready")):
    nltk.download(resource)
    print(ready_message)

## Generating dataset
instances = 100
subjective_sentences = [
    (sent, 'subj')
    for sent in subjectivity.sents(categories='subj')[:instances]
]
objective_sentences = [
    (sent, 'obj')
    for sent in subjectivity.sents(categories='obj')[:instances]
]

# Divide each dataset: the first 80 documents train, the remainder tests.
train_subjective, test_subjective = (
    subjective_sentences[:80], subjective_sentences[80:])
train_objective, test_objective = (
    objective_sentences[:80], objective_sentences[80:])

# Objective documents come first in both combined sets.
training_docs = train_objective + train_subjective
testing_docs = test_objective + test_subjective
from features import word_features, document_features
from nltk.corpus import subjectivity
from nltk.corpus import words as wds
import nltk

##Features
# Feature vocabulary: the words selected by word_features from the
# subjectivity corpus plus the 'en-basic' word list.
numberWords = 100
words = word_features(subjectivity.words(), numberWords)
f = words + wds.words(fileids='en-basic')

##Data Set
subj = [(sentence, 'subj') for sentence in subjectivity.sents(categories='subj')]
obj = [(sentence, 'obj') for sentence in subjectivity.sents(categories='obj')]

# The cut point is computed from the subjective list and applied to both
# classes.
length = len(subj)
nintyPercent = int(length * .9)

# The first 90% of each class trains the model and the last 10% is held out
# for testing. The original assigned these two slices the other way round,
# which trained on 10% of the data and tested on 90%.
train_tokens = subj[:nintyPercent] + obj[:nintyPercent]
test_tokens = subj[nintyPercent:] + obj[nintyPercent:]

print("Test set length = " + str(len(test_tokens)))
print("Train set length = " + str(len(train_tokens)))

trainSet = [(document_features(sent, f), category) for (sent, category) in train_tokens]
testSet = [(document_features(sent, f), category) for (sent, category) in test_tokens]

#Train
# In[ ]:


# In[46]:


def divide_sets(labeled_sents):
    """Shuffle the labelled sentences and split them 80% / 20%.

    The input is copied before shuffling, so the caller's sequence is left
    untouched.

    :param labeled_sents: iterable of (sentence, label) items.
    :return: ``(train, test)`` lists.
    """
    shuffled = list(labeled_sents)
    shuffle(shuffled)
    ratio = int(len(shuffled) * 0.8)
    return shuffled[:ratio], shuffled[ratio:]


# In[91]:


labeled_sents = [(sent, label)
                 for label in sub.categories()
                 for sent in sub.sents(categories=label)]
trainset, testset = divide_sets(labeled_sents)


# In[92]:


print(trainset[0])
print(testset[0])


# In[93]:


lemtzer = WordNetLemmatizer()
stopws = stopwords.words('english')
# Set copy for constant-time membership tests; ``stopws`` itself stays a list.
stopword_set = frozenset(stopws)

fd = nltk.FreqDist(
    lemtzer.lemmatize(w)
    for sent, label in trainset
    for w in sent
    if w.isalnum() and w not in stopword_set)

# The 500 most frequent lemmas. On NLTK 3 a FreqDist iterates in insertion
# order, so ``list(fd)[:500]`` returned the first 500 words seen rather than
# the most frequent ones.
ftwords = [word for word, _count in fd.most_common(500)]
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
import pickle

# Every sentence of both categories, as (tokens, label) documents.
subj_docs = [
    (sent, 'subj') for sent in subjectivity.sents(categories='subj')
]
obj_docs = [
    (sent, 'obj') for sent in subjectivity.sents(categories='obj')
]

# The first 2500 documents per class train the model; the rest are for testing.
train_subj_docs, test_subj_docs = subj_docs[:2500], subj_docs[2500:]
train_obj_docs, test_obj_docs = obj_docs[:2500], obj_docs[2500:]
training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words(
    [mark_negation(doc) for doc in training_docs]
)
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
sentim_analyzer.add_feat_extractor(
    extract_unigram_feats, unigrams=unigram_feats
)

training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

# Persist the analyzer and the classifier in separate pickle files.
for pickle_path, payload in (('senti_analyzer.pkl', sentim_analyzer),
                             ('classifier.pkl', classifier)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)
# NLTK's collection of sentiment analysis tools.
# More info: www.nltk.org
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

## Build training and testing data sets

# Number of phrases taken from each category
n = 1000

# First "n" subjective and objective phrases of the subjectivity corpus
subjective = [
    (sentences, 'subj')
    for sentences in subjectivity.sents(categories='subj')[:n]
]
objective = [
    (sentences, 'obj')
    for sentences in subjectivity.sents(categories='obj')[:n]
]

# The first item of "subjective"; each item is stored as (phrase, label)
subjective[0]

# Separate training and test sets, as is standard in data mining / machine
# learning tasks. The split used here is training = 80%, test = 20%.
split_at = int(.8*n)
training_subjective, test_subjective = (
    subjective[:split_at], subjective[split_at:n])
training_objective, test_objective = (
    objective[:split_at], objective[split_at:n])
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# First n labelled sentences of each category; all are used for training.
n = 100
subj = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n]]
# Objective sentences must carry the 'obj' label. The original labelled them
# 'subj' as well, leaving the classifier with a single class to learn.
obj = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n]]
train_data = subj + obj

sa = SentimentAnalyzer()
neg_words = sa.all_words([mark_negation(doc) for doc in train_data])
uf = sa.unigram_word_feats(neg_words, min_freq=4)
sa.add_feat_extractor(extract_unigram_feats, unigrams=uf)

training_set = sa.apply_features(train_data)

trainer = NaiveBayesClassifier.train
classifier = sa.train(trainer, training_set)


def getPolarity():
    """Return the polarity scores of the module-level ``stri``.

    ``stri`` is not defined in the visible code; it must exist at module
    level before this function is called. The scores were previously
    computed and discarded; returning them is backward-compatible.

    :return: the result of ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    sia = SentimentIntensityAnalyzer()
    return sia.polarity_scores(stri)
# NLTK's collection of sentiment analysis tools.
# More info: www.nltk.org
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

## Build training and testing data sets

# Number of phrases taken from each category
n = 1000

# First "n" subjective and objective phrases of the subjectivity corpus
subjective = [
    (sentences, "subj")
    for sentences in subjectivity.sents(categories="subj")[:n]
]
objective = [
    (sentences, "obj")
    for sentences in subjectivity.sents(categories="obj")[:n]
]

# The first item of "subjective"; each item is stored as (phrase, label)
subjective[0]

# Separate training and test sets, as is standard in data mining / machine
# learning tasks. The split used here is training = 80%, test = 20%.
boundary = int(0.8 * n)
training_subjective, test_subjective = (
    subjective[:boundary], subjective[boundary:n])
training_objective, test_objective = (
    objective[:boundary], objective[boundary:n])
def training_setup():
    """Build the training set from the NLTK subjectivity corpus.

    Takes up to 5000 sentences from each category as ``(sentence, label)``
    tuples.

    :return: a one-element list whose single item is the combined training
        list, objective documents first, then subjective ones.
    """
    subjective_docs = [
        (sentences, 'subj')
        for sentences in subjectivity.sents(categories='subj')[:5000]
    ]
    objective_docs = [
        (sentences, 'obj')
        for sentences in subjectivity.sents(categories='obj')[:5000]
    ]
    return [objective_docs + subjective_docs]
def train_sentiment_analyzer_subjectivity(n_instances=None):
    """Train, evaluate and save a subjectivity ``SentimentAnalyzer``.

    Stop words, punctuation and purely numeric tokens are removed from the
    negation-marked training vocabulary before the unigram features are
    selected. The classifier accuracy is printed and logged, and the
    analyzer is saved to 'files/sa_subjectivity.pickle'.

    :param n_instances: total number of sentences to use; half are taken
        from each category. ``None`` uses every sentence.
    """
    if n_instances is not None:
        n_instances = int(n_instances / 2)

    # NLTK's integrated subjectivity dataset for the subj training
    subj_docs = [(sent, 'subj')
                 for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a
    # balanced uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])

    # Tokens to drop, kept in a set for constant-time membership tests.
    ignored_tokens = set(stopwords.words('english'))
    ignored_tokens.update(string.punctuation)
    ignored_tokens.update(("''", "``", "—", "…", "...", "--", ".."))

    # ``word.isdigit()`` drops every numeric token. The previous test,
    # ``word not in string.digits``, was a substring check against
    # '0123456789' and let tokens such as '2005' or '21' through.
    all_words_clean = [
        word for word in all_words
        if word not in ignored_tokens and not word.isdigit()
    ]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_clean,
                                                       min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    testing_set = sentim_analyzer.apply_features(testing_docs)

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        message = "Your classifier does not provide a show_most_informative_features() method."
        print(message)
        read_write.log_message(message)

    sentim_analyzer.evaluate(testing_set)
    classifier_accuracy_percent = (classify.accuracy(classifier, testing_set)) * 100
    message_acc = 'Accuracy of classifier = ' + str(
        classifier_accuracy_percent) + '%'
    print(message_acc)
    read_write.log_message("[INFO]" + LOG_NAME + message_acc)

    save_file(sentim_analyzer, 'files/sa_subjectivity.pickle')
    message = "sa_subjectivity.pickle file saved."
    print(message)
    read_write.log_message(message)
# Sentences of the Gutenberg texts used below, loaded in a fixed order.
(bryant, burgess, carroll,
 ch_ball, ch_brown, ch_thurs,
 edge, mel, mil,
 caesar, hamlet, macbeth,
 whit) = (
    gutenberg.sents(fileid)
    for fileid in (
        'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt',
        'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt',
        'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt',
        'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt',
        'whitman-leaves.txt',
    )
)

# ABC corpus files and the two files of the subjectivity corpus.
rural, science = abc.sents('rural.txt'), abc.sents('science.txt')
plots = subjectivity.sents('plot.tok.gt9.5000')
quotes = subjectivity.sents('quote.tok.gt9.5000')

# Grouped collections. sense, emma, persuasion, b and sents are defined
# before this section.
austen = sense + emma + persuasion
shakespeare = caesar + hamlet + macbeth
facts = rural + science
opinions = plots + quotes
gute = bryant + burgess + carroll + edge + mel + mil + whit
chester = ch_ball + ch_brown + ch_thurs

total = austen + shakespeare + facts + opinions + gute + chester + b + sents

# Train word vectors on all sentences and save them in binary word2vec format.
g = Word2Vec(total)
g.wv.save_word2vec_format('model.bin', binary=True)
def plot():
    """Render every chat-statistics image and refresh the summary globals.

    The global ``df`` is restricted to ``startDate``..``endDate`` (the
    filtered frame replaces the global) and gains ``Sentiment`` and
    ``WordCount`` columns. Images written to ``uploads/``:
    ``timeofday.png`` and ``dayofweek.png`` (polar charts), ``timeline.png``,
    ``author1cloud.png`` / ``author2cloud.png`` (word clouds shaped by
    ``static/<shape>.jpg``) and ``sentiment.png``. The globals
    ``author1_wpm``, ``author2_wpm``, ``more_words``, ``author1_messages``,
    ``author2_messages`` and ``more_messages`` are set to summary strings.
    """
    global df
    global author1
    global author2
    global startDate
    global endDate
    global author1_wpm
    global author2_wpm
    global more_words
    global author1_messages
    global author2_messages
    global more_messages

    # Restrict the data to the requested date range.
    in_range = (df['Date'] >= startDate) & (df['Date'] <= endDate)
    df = df.loc[in_range]
    author1_df = df.loc[(df['Author'] == author1)]
    author2_df = df.loc[(df['Author'] == author2)]

    def roundup(x):
        """Round x up to the next multiple of 100 (exact multiples are kept)."""
        return int(x) if x % 100 == 0 else int(x + 100 - x % 100)

    def counts_table(label_column, categories, author1_counts,
                     author2_counts):
        """Return ({label_column: names, category: [n1, n2]}, largest count)."""
        table = {label_column: ['Author 1', 'Author 2']}
        largest = 0
        for category in categories:
            pair = [0, 0]
            if category in author1_counts.index:
                pair[0] = author1_counts.loc[category].item()
            if category in author2_counts.index:
                pair[1] = author2_counts.loc[category].item()
            table[category] = pair
            # Both authors must be considered for every category, otherwise
            # the radial axis can end up shorter than the data.
            largest = max(largest, int(pair[0]), int(pair[1]))
        return table, largest

    def draw_radar(table, maxcount, label_column, axes_label, filename,
                   align_tick_labels):
        """Draw both authors on one polar chart and save it under uploads/."""
        plt.style.use('fivethirtyeight')
        plt.style.use('bmh')
        plt.rcParams["font.family"] = "Gabriola"
        plt.rcParams.update({'font.size': 16})

        frame = pd.DataFrame(table)
        maxcount = roundup(maxcount) + 200
        a = roundup(maxcount / 4)
        b = roundup(maxcount / 2)
        c = roundup(3 * maxcount / 4)

        # One axis per category; the first column only holds the row labels.
        categories = list(frame)[1:]
        N = len(categories)
        angles = [n / float(N) * 2 * pi for n in range(N)]
        angles += angles[:1]  # repeat the first angle to close the polygon

        ax = plt.subplot(111, polar=True, label=axes_label)
        # Put the first axis on top and go clockwise.
        ax.set_theta_offset(pi / 2)
        ax.set_theta_direction(-1)

        plt.xticks(angles[:-1], categories, fontsize=16)
        if align_tick_labels:
            # Labels on the right half grow rightwards, labels on the left
            # half grow leftwards, the top label is centred.
            for tick_label, angle_rad in zip(ax.get_xticklabels(), angles):
                if angle_rad == 0:
                    ha = 'center'
                elif angle_rad <= pi:
                    ha = 'left'
                else:
                    ha = 'right'
                tick_label.set_horizontalalignment(ha)

        ax.set_rlabel_position(0)
        plt.yticks([a, b, c], [str(a), str(b), str(c)],
                   color="grey",
                   size=12)
        plt.ylim(0, maxcount)

        for row, name, colour in ((0, author1_name, author1_colour),
                                  (1, author2_name, author2_colour)):
            values = frame.loc[row].drop(label_column).values.flatten(
            ).tolist()
            values += values[:1]
            ax.plot(angles,
                    values,
                    linewidth=1,
                    linestyle='solid',
                    label=name,
                    color=colour)
            ax.fill(angles, values, colour, alpha=0.1)

        plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
        plt.savefig(os.path.join("uploads", filename), bbox_inches='tight')

    ### FIRST: time of day ###
    author1_hour = author1_df['Hour'].value_counts()
    author2_hour = author2_df['Hour'].value_counts()
    hour_labels = ['{:02d}'.format(hour) for hour in range(24)]
    tod_data, tod_max = counts_table('hourlist', hour_labels, author1_hour,
                                     author2_hour)
    draw_radar(tod_data, tod_max, 'hourlist', 'time of day', 'timeofday.png',
               align_tick_labels=False)

    ### SECOND: day of week ###
    author1_day = author1_df['Day_of_week'].value_counts()
    author2_day = author2_df['Day_of_week'].value_counts()
    days_in_order = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
        'Sunday'
    ]
    dow_data, dow_max = counts_table('Day', days_in_order, author1_day,
                                     author2_day)
    draw_radar(dow_data, dow_max, 'Day', 'day of week', 'dayofweek.png',
               align_tick_labels=True)

    def to_author_frame(data):
        """Turn {'date': names, key: [v1, v2]} into a frame indexed by key."""
        frame = pd.DataFrame(data).T
        header = frame.iloc[0]
        frame = frame[1:]
        frame.columns = header
        return frame

    def draw_author_lines(frame, xlabel, filename):
        """Plot the Author1/Author2 columns as lines and save under uploads/."""
        plt.style.use('fivethirtyeight')
        plt.rcParams["font.family"] = "Gabriola"
        plt.rcParams.update({'font.size': 24})
        plt.figure(figsize=(20, 8))
        plt.xlabel(xlabel, fontsize=30)
        ax1 = frame.Author1.plot(color=author1_colour)
        frame.Author2.plot(color=author2_colour)
        ax1.xaxis.set_label_position('top')
        ax1.legend([author1_name, author2_name], loc='upper right')
        plt.savefig(os.path.join("uploads", filename), bbox_inches='tight')

    def timeline_data():
        """Count messages per value of column 0 for each author."""
        timeline_dictionary = {'date': ['Author1', 'Author2']}
        for i in range(len(df)):
            # assumes column 0 is the date and column 2 the author -- TODO confirm
            day, author = df.iloc[i, 0], df.iloc[i, 2]
            counts = timeline_dictionary.setdefault(day, [0, 0])
            if author == author1:
                counts[0] += 1
            if author == author2:
                counts[1] += 1
        return timeline_dictionary

    ### THIRD: timeline ###
    timeline_df = to_author_frame(timeline_data())
    draw_author_lines(timeline_df, 'Timeline', 'timeline.png')

    def top_words(messages_df):
        """Return the 40 most frequent words longer than 3 characters."""
        top_N = 40
        stopwords = nltk.corpus.stopwords.words('english')
        # One alternation regex that matches any stopword as a whole word.
        RE_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords))
        words = (messages_df.Message
                 .str.lower()
                 .replace([RE_stopwords], [''], regex=True)
                 .str.cat(sep=' ')
                 .split())
        words = [word for word in words if len(word) > 3]
        rslt = pd.DataFrame(Counter(words).most_common(top_N),
                            columns=['Word',
                                     'Frequency']).set_index('Word')
        return rslt

    def hex_to_rgb(hex_colour):
        """Split '#rrggbb' into a tuple of three integer channels."""
        hex_colour = hex_colour.lstrip('#')
        hlen = len(hex_colour)
        return tuple(
            int(hex_colour[i:i + hlen // 3], 16)
            for i in range(0, hlen, hlen // 3))

    def rgb_to_hsl(r, g, b):
        """Convert r, g, b in [0, 1] to (h, s, l), each in [0, 1]."""
        r = float(r)
        g = float(g)
        b = float(b)
        high = max(r, g, b)
        low = min(r, g, b)
        h, s, l = ((high + low) / 2, ) * 3

        if high == low:
            h = 0.0
            s = 0.0
        else:
            d = high - low
            s = d / (2 - high - low) if l > 0.5 else d / (high + low)
            h = {
                r: (g - b) / d + (6 if g < b else 0),
                g: (b - r) / d + 2,
                b: (r - g) / d + 4,
            }[high]
            h /= 6

        return h, s, l

    def hue_and_saturation(hex_colour):
        """Return (hue in degrees, saturation in percent) of a hex colour."""
        red, green, blue = hex_to_rgb(hex_colour)
        hue, saturation, _ = rgb_to_hsl(red / 255, green / 255, blue / 255)
        # Hue is a fraction of the full 360-degree colour wheel.
        return round(hue * 360), round(saturation * 100)

    def save_wordcloud(author_df, hex_colour, cloud_mask, filename):
        """Build an author's word cloud tinted with their colour and save it."""
        top = top_words(author_df)
        frequencies = dict(zip(top.index, top.Frequency))

        plt.style.use('fivethirtyeight')
        plt.rcParams["font.family"] = "Gabriola"
        plt.rcParams.update({'font.size': 16})

        wordcloud = WordCloud(background_color='#F0F0F0',
                              mask=cloud_mask,
                              width=cloud_mask.shape[1],
                              height=cloud_mask.shape[0])
        wordcloud.generate_from_frequencies(frequencies=frequencies)
        plt.figure()

        hue, saturation = hue_and_saturation(hex_colour)

        def color_func(word,
                       font_size,
                       position,
                       orientation,
                       random_state=None,
                       **kwargs):
            # Fixed hue and saturation, random lightness between 60% and 90%.
            return "hsl({0}, {1}%%, %d%%)".format(
                str(hue), str(saturation)) % random.randint(60, 90)

        plt.imshow(wordcloud.recolor(color_func=color_func),
                   interpolation="bilinear")
        plt.axis("off")
        wordcloud.to_file(os.path.join("uploads", filename))

    ### FOURTH: word clouds ###
    fileloc = os.path.join("static", shape + '.jpg')
    # The context manager closes the image file once the pixels are copied.
    with Image.open(fileloc) as shape_image:
        cloud_mask = np.array(shape_image)
    save_wordcloud(author1_df, author1_colour, cloud_mask, 'author1cloud.png')
    save_wordcloud(author2_df, author2_colour, cloud_mask, 'author2cloud.png')

    ### FIFTH: sentiment ###
    # Build the analyzer once instead of once per message.
    sid = SentimentIntensityAnalyzer()

    def sentiment(message):
        """Return the compound polarity score of a message."""
        return sid.polarity_scores(message)['compound']

    df["Sentiment"] = df.apply(lambda row: sentiment(row['Message']), axis=1)

    def sentiment_data():
        """Average the non-zero sentiment scores per month for each author."""
        sentiment_dictionary = {'date': ['Author1', 'Author2']}
        for i in range(len(df)):
            # assumes columns 0 / 2 / 6 are date / author / Sentiment -- TODO confirm
            month, author, score = str(
                df.iloc[i, 0]), df.iloc[i, 2], df.iloc[i, 6]
            # Messages scoring exactly 0.0 are left out of the averages.
            if score != 0.0:
                parts = month.split('-')
                month = parts[0] + '-' + parts[1]
                # Per author: [message count, score total].
                stats = sentiment_dictionary.setdefault(
                    month, [[0, 0.0], [0, 0.0]])
                if author == author1:
                    stats[0][0] += 1
                    stats[0][1] += score
                if author == author2:
                    stats[1][0] += 1
                    stats[1][1] += score

        # Replace each [count, total] pair by its mean (0 when no messages).
        for key, stats in sentiment_dictionary.items():
            if key == 'date':
                continue
            for position in (0, 1):
                count, total = stats[position]
                if count != 0:
                    stats[position] = float(total) / float(count)
                else:
                    stats[position] = 0
        return sentiment_dictionary

    sentiment_df = to_author_frame(sentiment_data())
    draw_author_lines(sentiment_df, 'Sentiment Analysis', 'sentiment.png')

    ### SIXTH: word and message counts ###
    def no_of_words(message):
        """Return the number of whitespace-separated words in a message."""
        return len(message.split())

    df["WordCount"] = df.apply(lambda row: no_of_words(row['Message']),
                               axis=1)

    # Re-select so the per-author frames include the new columns.
    author1_df = df.loc[(df['Author'] == author1)]
    author2_df = df.loc[(df['Author'] == author2)]

    author1_wpm = author1_name + "'s average word per message is {:0.2f}".format(
        author1_df["WordCount"].mean())
    author2_wpm = author2_name + "'s average word per message is {:0.2f}".format(
        author2_df["WordCount"].mean())

    def who_sent_more_words():
        """Describe which author wrote more words, as a percentage."""
        words1 = author1_df["WordCount"].sum()
        words2 = author2_df["WordCount"].sum()
        if words1 > words2:
            num = words1 / words2
            num = num * 100 - 100
            return (author1_name +
                    " sent {:0.0f}% more words than ".format(num) +
                    author2_name)
        elif words2 > words1:
            num = words2 / words1
            num = num * 100 - 100
            return (author2_name +
                    " sent {:0.0f}% more words than ".format(num) +
                    author1_name)
        else:
            return ("You both sent the same number of words somehow!")

    more_words = who_sent_more_words()

    # NOTE(review): 'days' is not declared global above, so this value is
    # discarded when plot() returns -- confirm whether it should be exported.
    days = "Number of days of texting: " + str(len(df["Date"].unique()))

    author1_messages = author1_name + " sent " + str(len(
        author1_df.index)) + " messages"
    author2_messages = author2_name + " sent " + str(len(
        author2_df.index)) + " messages"

    def who_sent_more():
        """Describe which author sent more messages, as a ratio."""
        # NOTE(review): raises ZeroDivisionError when the author with fewer
        # messages has none in the selected range -- confirm desired handling.
        count1 = len(author1_df.index)
        count2 = len(author2_df.index)
        if count1 > count2:
            num = count1 / count2
            return (author1_name +
                    " sent {:0.2f} times more messages than ".format(num) +
                    author2_name)
        elif count2 > count1:
            num = count2 / count1
            return (author2_name +
                    " sent {:0.2f} times more messages than ".format(num) +
                    author1_name)
        else:
            return ("You both sent the same number of messages somehow!")

    more_messages = who_sent_more()
return res # def is_stopword(word): # return word in stop # # def is_allowed(word): # if word.endswith("_NEG") and len(word)-4>3: # return True # elif len(word)>3: # return True # return False no_of_reviews = 6000 subj_docs = [(transform(sent), 'subj') for sent in subjectivity.sents(categories='subj')[:no_of_reviews]] obj_docs = [(transform(sent), 'obj') for sent in subjectivity.sents(categories='obj')[:no_of_reviews]] print("subj: %d, obj: %d" % (len(subj_docs), len(obj_docs))) partition = int(0.80 * no_of_reviews) train_subj_docs = subj_docs[:partition] test_subj_docs = subj_docs[partition:] train_obj_docs = obj_docs[:partition] test_obj_docs = obj_docs[partition:] training_docs = train_subj_docs + train_obj_docs testing_docs = test_subj_docs + test_obj_docs # applying filters
# Fetch the sentence view of each NLTK corpus in turn, printing a progress
# message after every call to .sents().
conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")
conll2007_corp_sents = conll2007.sents()
# NOTE(review): the message below reads "condll2007"; the corpus is conll2007.
print("condll2007 to sents")
inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")
abc_corp_sents = abc.sents()
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectvity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents ")
guttenberg_corp_sents = gutenberg.sents()
print("Guttenberg to sents")
# NOTE(review): the variable is spelled "corb" and the message below says
# "Freebank", but the corpus loaded here is treebank.
treebank_corb_sents = treebank.sents()
print("Freebank to sents")
reuters_corp_sents = reuters.sents()
print("Reuters to sents")
webtext_corp_sents = webtext.sents()
print("Webtext to sents")

# Configure log records as "<time> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
outputScore.close() return True except Exception as e: print(e) def on_error(self, status): print(status) try: n_instances = 100 subj_docs = [ (sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances] ] obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]] len(subj_docs), len(obj_docs) subj_docs[0] train_subj_docs = subj_docs[:80] test_subj_docs = subj_docs[80:100] train_obj_docs = obj_docs[:80] test_obj_docs = obj_docs[80:100] training_docs = train_subj_docs + train_obj_docs testing_docs = test_subj_docs + test_obj_docs sentim_analyzer = SentimentAnalyzer() all_words_neg = sentim_analyzer.all_words( [mark_negation(doc) for doc in training_docs]) unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg,
def sample():
    """Load up to 200 sentences per subjectivity class and print the counts."""
    n_instances = 200
    subj_docs = [
        (sent, 'subj')
        for sent in subjectivity.sents(categories='subj')[:n_instances]
    ]
    obj_docs = [
        (sent, 'obj')
        for sent in subjectivity.sents(categories='obj')[:n_instances]
    ]
    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("subj docs %d obj docs %d" % (len(subj_docs), len(obj_docs)))