Example #1
    def __init__(self):
        # each document is represented by a tuple (sentence, label)
        n_instances = 100
        subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
        obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

        # split subj and obj instances to keep a balanced, uniform class distribution in both train and test sets
        train_subj_docs = subj_docs[:80]
        test_subj_docs = subj_docs[80:100]
        train_obj_docs = obj_docs[:80]
        test_obj_docs = obj_docs[80:100]
        training_docs = train_subj_docs+train_obj_docs
        testing_docs = test_subj_docs+test_obj_docs

        #train classifier
        sentim_analyzer = SentimentAnalyzer()
        all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

        #use simple unigram word features, handling negation
        unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
        sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

        # apply features to obtain a feature-value representation of our datasets
        training_set = sentim_analyzer.apply_features(training_docs)
        test_set = sentim_analyzer.apply_features(testing_docs)
        self.trainer = NaiveBayesClassifier.train
        self.classifier = sentim_analyzer.train(self.trainer, training_set)
        for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
            print('{0}: {1}'.format(key, value))
        self.sid = SentimentIntensityAnalyzer()
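This __init__ fragment omits its imports; a minimal sketch of the imports it appears to assume (all real NLTK modules matching the names used above; the original project's import style may differ):

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, mark_negation
from nltk.sentiment.vader import SentimentIntensityAnalyzer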
Example #2
def train():
    subj_docs = [
        (sent, 'subj')
        for sent in subjectivity.sents(categories='subj')[:n_instances]
    ]
    obj_docs = [(sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:n_instances]]

    train_subj_docs = subj_docs[:80]
    test_subj_docs = subj_docs[80:100]
    train_obj_docs = obj_docs[:80]
    test_obj_docs = obj_docs[80:100]
    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])

    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg,
                                                       min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)
    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))
Example #3
def trainSubjectivity():
    # Subjective vs. objective sentence classifier. Borrows from NLTK Documentation.
    # Plan to use it in a larger machine learning sentiment model as pre-processing
    # Must differentiate between objective and subjective
    subjDocs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')]
    objDocs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')]
    nSubj = len(subjDocs)
    nObj = len(objDocs)
    # 90% Training, 10% Test
    subjTrain = int(.9 * nSubj)
    objTrain = int(.9 * nObj)
    trainSubj = subjDocs[:subjTrain]
    testSubj = subjDocs[subjTrain:nSubj]
    trainObj = objDocs[:objTrain]
    testObj = objDocs[objTrain:nObj]
    trainDocs = trainSubj + trainObj
    testDocs = testSubj + testObj
    # Create sentiment class, mark negation, create features (unigram)
    sentiment = SentimentAnalyzer()
    markNegation = sentiment.all_words([mark_negation(doc) for doc in trainDocs])
    unigramFeats = sentiment.unigram_word_feats(markNegation, min_freq=4)
    sentiment.add_feat_extractor(extract_unigram_feats, unigrams=unigramFeats)
    training = sentiment.apply_features(trainDocs)
    testing = sentiment.apply_features(testDocs)
    # Train classifier
    trainer = NaiveBayesClassifier.train
    subjectivityClassifier = sentiment.train(trainer, training)
    joblib.dump(subjectivityClassifier, 'subjectivity.pkl')
    for key, value in sorted(sentiment.evaluate(testing).items()): print('{0}: {1}'.format(key, value))
Example #4
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = int(n_instances/2)

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs+train_obj_docs
    testing_docs = test_subj_docs+test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer == True:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer
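A minimal usage sketch for the demo above, assuming it is importable (the same function ships as nltk.sentiment.util.demo_subjectivity); the example sentence is hypothetical and the subjectivity corpus must already be downloaded:

from nltk.classify import NaiveBayesClassifier
from nltk.sentiment.util import demo_subjectivity

analyzer = demo_subjectivity(NaiveBayesClassifier.train, n_instances=200)
# classify() applies the stored unigram feature extractor to a tokenized sentence
print(analyzer.classify('the film is a small , quietly moving gem .'.split()))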
Example #6
 def GetTrainDataSetForNLTK(self, instances=100):
     subj_docs = [
         (sent, 'subj')
         for sent in subjectivity.sents(categories='subj')[:instances]
     ]
     obj_docs = [
         (sent, 'obj')
         for sent in subjectivity.sents(categories='obj')[:instances]
     ]
     train_subj_docs = subj_docs
     train_obj_docs = obj_docs
     trainSet = train_subj_docs + train_obj_docs
     return trainSet
Example #7
def sentiment_analysis(data):
    from nltk.classify import NaiveBayesClassifier
    from nltk.corpus import subjectivity
    from nltk.sentiment import SentimentAnalyzer
    from nltk.sentiment.util import *

    n_instances = 100
    subj_docs = [
        (sent, 'subj')
        for sent in subjectivity.sents(categories='subj')[:n_instances]
    ]
    obj_docs = [(sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:n_instances]]

    train_subj_docs = subj_docs[:80]
    test_subj_docs = subj_docs[80:100]
    train_obj_docs = obj_docs[:80]
    test_obj_docs = obj_docs[80:100]
    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])

    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg,
                                                       min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)

    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))

    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    from nltk import tokenize

    sid = SentimentIntensityAnalyzer()
    for line in data:
        ss = sid.polarity_scores(line['line_text'])
        line['compound'] = ss['compound']
        line['neg'] = ss['neg']
        line['pos'] = ss['pos']
        line['neu'] = ss['neu']
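A minimal usage sketch for the function above (the input dicts are hypothetical; the function annotates each dict in place with VADER scores under the keys it sets above):

lines = [{'line_text': 'What a wonderful, uplifting film!'},
         {'line_text': 'The service was slow and the food was cold.'}]
sentiment_analysis(lines)
for line in lines:
    print(line['line_text'], line['compound'], line['pos'], line['neg'], line['neu'])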
Example #8
def train_sentiment():
    instances = 8000
    subj = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:instances]]
    obj = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:instances]]
    train_subj = subj
    train_obj = obj
    train_set = train_subj + train_obj
    sentiment = SentimentAnalyzer()
    all_neg = sentiment.all_words([mark_negation(doc) for doc in train_set])
    uni_g = sentiment.unigram_word_feats(all_neg, min_freq=4)
    sentiment.add_feat_extractor(extract_unigram_feats, unigrams=uni_g)
    trained_set = sentiment.apply_features(train_set)
    nb = NaiveBayesClassifier.train
    classifier = sentiment.train(nb, trained_set)
    return classifier
Example #9
    def __init__(self, load=True, set_defaults=True):

        # Set global defaults
        self.data_source = "papers.csv"
        self.source = self.keywords = self.authors = self.date = self.text = self.title = []
        # Preload data from master file
        if load == True:
            self.load_data()

        # Prepare default training data for sentiment analysis
        if set_defaults == True:
            training_subjective = [(sentences, "subj") for sentences in sub.sents(categories="subj")[:500]]
            training_objective = [(sentences, "obj") for sentences in sub.sents(categories="obj")[:500]]
            self.training_data = training_objective + training_subjective

        print ("NewsAnalyzer created")
Example #10
def subjectivity_classifier():
    from nltk.classify import NaiveBayesClassifier
    from nltk.corpus import subjectivity
    from nltk.sentiment import SentimentAnalyzer
    from nltk.sentiment.util import *
    """
    Initializes and trains categorical subjectivity analyzer
    """
    N_INSTANCES = 100

    subj_docs = [
        (sent, 'subj')
        for sent in subjectivity.sents(categories='subj')[:N_INSTANCES]
    ]
    obj_docs = [(sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:N_INSTANCES]]

    train_subj_docs = subj_docs[:80]
    test_subj_docs = subj_docs[80:]
    train_obj_docs = obj_docs[:80]
    test_obj_docs = obj_docs[80:]
    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    sent_analyzer = SentimentAnalyzer()
    all_words_neg = sent_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])

    unigram_feats = sent_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    print(f"unigram feats: {len(unigram_feats)}")

    sent_analyzer.add_feat_extractor(extract_unigram_feats,
                                     unigrams=unigram_feats)

    training_set = sent_analyzer.apply_features(training_docs)
    test_set = sent_analyzer.apply_features(testing_docs)

    trainer = NaiveBayesClassifier.train
    classifier = sent_analyzer.train(trainer, training_set)
    for k, v in sorted(sent_analyzer.evaluate(test_set).items()):
        print(f"{k}: {v}")

    return sent_analyzer
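A minimal usage sketch (the sentence is hypothetical): the returned analyzer keeps both the trained classifier and the unigram feature extractor, so its classify() method accepts a tokenized sentence directly.

analyzer = subjectivity_classifier()
print(analyzer.classify('the plot follows two brothers on a long drive home'.split()))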
Example #11
def analyze_sentiment(paragraph):
    n_instances = 100
    subj_docs = [
        (sent, 'subj')
        for sent in subjectivity.sents(categories='subj')[:n_instances]
    ]
    obj_docs = [(sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:n_instances]]

    train_subj_docs = subj_docs[:80]
    test_subj_docs = subj_docs[80:100]
    train_obj_docs = obj_docs[:80]
    test_obj_docs = obj_docs[80:100]
    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs
    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])

    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg,
                                                       min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)

    sid = SentimentIntensityAnalyzer()
    total_sum = 0
    count = 0.0

    sentences = sent_tokenize(paragraph)

    for sentence in sentences:
        total_sum += sid.polarity_scores(sentence)["compound"]
        count += 1

    return total_sum * 10 / count
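Example call with a hypothetical paragraph; the return value is the mean VADER compound score over the paragraph's sentences, scaled by 10 (note that the subjectivity classifier trained above is not actually used in the returned score):

score = analyze_sentiment('The acting was superb. The plot, however, dragged badly.')
print(score)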
Example #12
def get_objectivity_analyzer():
    n_instances = 100
    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
    
    train_subj_docs = subj_docs
    train_obj_docs = obj_docs
    training_docs = train_subj_docs+train_obj_docs
    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
    
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    
    training_set = sentim_analyzer.apply_features(training_docs)

    trainer = NaiveBayesClassifier.train
    sentiment_classifier = sentim_analyzer.train(trainer, training_set)
    
    return sentim_analyzer
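A minimal usage sketch (the sentence is hypothetical); mark_negation, from nltk.sentiment.util as used above, mirrors the preprocessing applied to the training documents:

analyzer = get_objectivity_analyzer()
tokens = mark_negation('i do not think the plot makes much sense'.split())
print(analyzer.classify(tokens))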
Example #13
def build_NBclassifier():
    '''
    The Subjectivity Dataset contains 5000 subjective and 5000 objective processed sentences.
    gets 91% accuracy on 1000 sentence test set.
    '''
    random.seed(1)
    subjective_sents = subjectivity.sents(categories='subj')
    objective_sents = subjectivity.sents(categories='obj')

    sents = []
    for sent in subjective_sents:
        sents.append((sent, 'subj'))
    for sent in objective_sents:
        sents.append((sent, 'obj'))

    random.shuffle(sents)
    train = sents
    cl = NaiveBayesClassifier(train)
    # save the model to disk
    filename = 'NB_Subj_Model.sav'
    pickle.dump(cl, open(filename, 'wb'))
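Companion sketch for reloading the model saved above (variable name here is hypothetical; only the load step is shown, since how the model is queried depends on which NaiveBayesClassifier implementation was used at training time):

import pickle

with open('NB_Subj_Model.sav', 'rb') as fh:
    nb_subjectivity_model = pickle.load(fh)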
Example #14
class SentimentAnalysis(object):
    instance_items = 100
    subjects = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:instance_items]]
    objects = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:instance_items]]

    subject_train = subjects[:80]
    subject_test = subjects[80:100]
    object_train = objects[:80]
    object_test = objects[80:100]
    training_docs = subject_train+object_train
    testing_docs = subject_test+object_test
    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)

    def Analysis_Result(file_url):
        file = open(file_url, "r")
        content = file.read()
        content_list = tokenize.sent_tokenize(content)
        print("Analysis started \n")
        print(content_list)
        print("")
        file.close()

        sentiment_intensity = SentimentIntensityAnalyzer()
        for line in content_list:
            print(line)
            sentiment_score = sentiment_intensity.polarity_scores(line)
            for k in sorted(sentiment_score):
                print('{0}: {1}, '.format(k, sentiment_score[k]), end='')
            print()
Example #15
    def prepare_training_and_test_data(self):
        """
        Each document is represented by a tuple (sentence, label). The sentence is tokenized, so it is represented by a list of strings.
        E.g: (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one',
              'thing', 'is', 'a', 'small', 'gem', '.'], 'subj')
        """
        subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:self.n_instances]]
        obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:self.n_instances]]

        # We separately split subjective and objective instances to keep a balanced uniform class distribution in both train and test sets.
        training_end = self.n_training
        testing_start = training_end
        testing_end = testing_start + self.n_testing

        
        train_subj_docs = subj_docs[:training_end]
        test_subj_docs = subj_docs[testing_start:testing_end]

        train_obj_docs = obj_docs[:training_end]
        test_obj_docs = obj_docs[testing_start:testing_end]
        
        self.training_docs = train_subj_docs + train_obj_docs
        self.testing_docs = test_subj_docs + test_obj_docs
Example #16
import nltk

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
n_instances = 100
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
len(subj_docs), len(obj_docs)
# (100, 100)
subj_docs[0]
# (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one',
#  'thing', 'is', 'a', 'small', 'gem', '.'], 'subj')

train_subj_docs = subj_docs[:80]
test_subj_docs = subj_docs[80:100]
train_obj_docs = obj_docs[:80]
test_obj_docs = obj_docs[80:100]
training_docs = train_subj_docs+train_obj_docs
testing_docs = test_subj_docs+test_obj_docs
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=1)
len(unigram_feats)

sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
training_set = sentim_analyzer.apply_features(training_docs)
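A sketch of how this walkthrough typically continues, mirroring the training and evaluation steps used in the other examples on this page (no new names are introduced):

test_set = sentim_analyzer.apply_features(testing_docs)
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))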
Example #17
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import subjectivity
print(subjectivity.categories())
print(subjectivity.sents()[23])
print(subjectivity.words(categories='subj'))
Example #18
    def get_tweets(self, query, count=10):

        tweets = []

        try:
            #get the tweets from twitter
            fetched_tweets = self.api.search(q=query, count=count)

            n_instances = 100
            subj_docs = [
                (sent, 'subj')
                for sent in subjectivity.sents(categories='subj')[:n_instances]
            ]
            obj_docs = [
                (sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:n_instances]
            ]

            train_subj_docs = subj_docs[:80]
            test_subj_docs = subj_docs[80:100]
            train_obj_docs = obj_docs[:80]
            test_obj_docs = obj_docs[80:100]
            training_docs = train_subj_docs + train_obj_docs
            testing_docs = test_subj_docs + test_obj_docs
            emotion_analyzer = SentimentAnalyzer()
            #get the negative words for feature extraction
            all_radical_slurs = emotion_analyzer.all_words(
                [mark_negation(doc) for doc in training_docs])

            unigram_feats = emotion_analyzer.unigram_word_feats(
                all_radical_slurs, min_freq=4)

            emotion_analyzer.add_feat_extractor(extract_unigram_feats,
                                                unigrams=unigram_feats)

            training_set = emotion_analyzer.apply_features(training_docs)
            test_set = emotion_analyzer.apply_features(testing_docs)

            trainer = NaiveBayesClassifier.train
            classifier = emotion_analyzer.train(trainer, training_set)

            #test sentences
            sentences = [
                "Ravi is the worst boy in class",
                "The story is full of mean bitchy characters",
                "I had a good day!", "The day was okay",
                "The day was very bad", "Harry potter is a good book",
                "New Tata electric car is a piece of shit",
                "It has been a long time since I had a good food",
                "Stop acting as a asshole"
            ]

            sid = SentimentIntensityAnalyzer()
            for sentence in sentences:
                print(sentence)
                ss = sid.polarity_scores(sentence)
                for k in sorted(ss):
                    print('{0}: {1}, '.format(k, ss[k]), end='')
                print()
            for tweet in fetched_tweets:
                print(tweet.text)
                ss = sid.polarity_scores(tweet.text)
                for k in sorted(ss):
                    print('{0}: {1}, '.format(k, ss[k]), end='')
                print()

            return tweets

        except tweepy.TweepError as e:
            print("Error : " + str(e))
Example #19
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

n_instances = 5000
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
print("Len subj_docs and obj docs: ", len(subj_docs), len(obj_docs), "\n", "\n")
print("subj_docs: ", subj_docs, "\n", "obj_docs: ", obj_docs, "\n")
print(len(subjectivity.sents(categories='subj')), "\n")

train_subj_docs = subj_docs[:4980]
test_subj_docs = subj_docs[4980:5000]
train_obj_docs = obj_docs[:4980]
test_obj_docs = obj_docs[4980:5000]
training_docs = train_subj_docs+train_obj_docs
testing_docs = test_subj_docs+test_obj_docs
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
print(len(unigram_feats), "\n")

sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
Example #20
from nltk import tokenize
from news import SimpleArticle

# Download the required NLTK resources
nltk.download('subjectivity')
print("Subjectivity ready")
nltk.download('punkt')
print("punkt ready")
nltk.download('vader_lexicon')
print("VADER ready")

## Generating dataset
instances = 100
subjective_sentences = [
    (sent, 'subj')
    for sent in subjectivity.sents(categories='subj')[:instances]
]
objective_sentences = [
    (sent, 'obj') for sent in subjectivity.sents(categories='obj')[:instances]
]

# Divide each dataset into 80% train, 20% test
train_subjective = subjective_sentences[:80]
test_subjective = subjective_sentences[80:]

train_objective = objective_sentences[:80]
test_objective = objective_sentences[80:]

training_docs = train_objective + train_subjective
testing_docs = test_objective + test_subjective
Example #21
from features import word_features, document_features
from nltk.corpus import subjectivity
from nltk.corpus import words as wds
import nltk

##Features
numberWords = 100
words = word_features(subjectivity.words(), numberWords)
f = words + wds.words(fileids='en-basic')

##Data Set

subj = [(sentence, 'subj')
        for sentence in subjectivity.sents(categories='subj')]
obj = [(sentence, 'obj') for sentence in subjectivity.sents(categories='obj')]

length = len(subj)
nintyPercent = int(length * .9)

test_tokens = subj[:nintyPercent] + obj[:nintyPercent]
train_tokens = subj[nintyPercent:] + obj[nintyPercent:]

print("Test set length  = " + str(len(test_tokens)))
print("Train set length = " + str(len(train_tokens)))

trainSet = [(document_features(sent, f), category)
            for (sent, category) in train_tokens]
testSet = [(document_features(sent, f), category)
           for (sent, category) in test_tokens]

#Train
Example #22
# In[ ]:

# In[46]:


def divide_sets(labeled_sents):
    shuffle(labeled_sents)
    ratio = int(len(labeled_sents) * 0.8)
    train, test = labeled_sents[:ratio], labeled_sents[ratio:]
    return train, test


# In[91]:

labeled_sents = [(sent, label) for label in sub.categories()
                 for sent in sub.sents(categories=label)]
trainset, testset = divide_sets(labeled_sents)

# In[92]:

print(trainset[0])
print(testset[0])

# In[93]:

lemtzer = WordNetLemmatizer()
stopws = stopwords.words('english')
fd = nltk.FreqDist(
    lemtzer.lemmatize(w) for sent, label in trainset for w in sent
    if w.isalnum() and w not in stopws)
ftwords = list(fd)[:500]
Example #23
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
import pickle

subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')]
train_subj_docs = subj_docs[:2500]
test_subj_docs = subj_docs[2500:]
train_obj_docs = obj_docs[:2500]
test_obj_docs = obj_docs[2500:]
training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words(
    [mark_negation(doc) for doc in training_docs])
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                   unigrams=unigram_feats)
training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
with open('senti_analyzer.pkl', 'wb') as f:
    pickle.dump(sentim_analyzer, f)
with open('classifier.pkl', 'wb') as f:
    pickle.dump(classifier, f)
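Companion loading sketch using the same file names as above; the reloaded analyzer keeps its feature extractors and trained classifier, so classify() works on a tokenized sentence (the variable name and sentence here are hypothetical):

import pickle

with open('senti_analyzer.pkl', 'rb') as f:
    loaded_analyzer = pickle.load(f)
print(loaded_analyzer.classify('the film is a quiet , deliberate meditation .'.split()))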
Example #24
# The NLTK's collection of sentiment analysis tools
# More info www.nltk.org

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

## Build training and testing data sets

# Size of dataset(s)

n = 1000

# Get "n" subjective and objective phrases from subjectivity corpus

subjective = [(sentences,'subj') for sentences in subjectivity.sents(categories='subj')[:n]]
objective = [(sentences,'obj') for sentences in subjectivity.sents(categories='obj')[:n]]

# Here's what the first item in "subjective" looks like
# Note that it's stored as (phrase, label)

subjective[0]

# Create separate training and test data sets; this is pretty standard in any data mining/machine learning task
# The typical split, as seen here, is training = 80%, test = 20%

training_subjective = subjective[:int(.8*n)]
test_subjective = subjective[int(.8*n):n]
training_objective = objective[:int(.8*n)]
test_objective = objective[int(.8*n):n]
Example #25
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer

n = 100
subj = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n]]
obj = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n]]
train_data = subj + obj
sa = SentimentAnalyzer()
neg_words = sa.all_words([mark_negation(doc) for doc in train_data])
uf = sa.unigram_word_feats(neg_words, min_freq=4)
sa.add_feat_extractor(extract_unigram_feats, unigrams=uf)
training_set = sa.apply_features(train_data)
#test_set = sentim_analyzer.apply_features(testing_docs)
trainer = NaiveBayesClassifier.train
classifier = sa.train(trainer, training_set)


def getPolarity(text):
    # score the given text with VADER and return the polarity dictionary
    sia = SentimentIntensityAnalyzer()
    return sia.polarity_scores(text)
Example #26
# The NLTK's collection of sentiment analysis tools
# More info www.nltk.org

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

## Build training and testing data sets

# Size of dataset(s)

n = 1000

# Get "n" subjective and objective phrases from subjectivity corpus

subjective = [(sentences, "subj") for sentences in subjectivity.sents(categories="subj")[:n]]
objective = [(sentences, "obj") for sentences in subjectivity.sents(categories="obj")[:n]]

# Here's what the first item in "subjective" looks like
# Note that it's stored as (phrase, label)

subjective[0]

# Create separate training and test data sets; this is pretty standard in any data mining/machine learning task
# The typical split, as seen here, is training = 80%, test = 20%

training_subjective = subjective[: int(0.8 * n)]
test_subjective = subjective[int(0.8 * n) : n]
training_objective = objective[: int(0.8 * n)]
test_objective = objective[int(0.8 * n) : n]
Example #27
def training_setup():
	## Build training set from nltk subjectivity/objectivity corpora
	training_subjective = [(sentences,'subj') for sentences in subjectivity.sents(categories='subj')[:5000]]
	training_objective = [(sentences,'obj') for sentences in subjectivity.sents(categories='obj')[:5000]]
	training = training_objective + training_subjective
	return [training]
Example #28
def train_sentiment_analyzer_subjectivity(n_instances=None):
    if n_instances is not None:
        n_instances = int(n_instances / 2)

    # NLTK's integrated subjectivity dataset, used for the subjectivity training
    subj_docs = [
        (sent, 'subj')
        for sent in subjectivity.sents(categories='subj')[:n_instances]
    ]
    obj_docs = [(sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    sentim_analyzer = SentimentAnalyzer()

    all_words = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])

    stopwords_english = stopwords.words('english')
    punctuation = list(string.punctuation)
    punctuation.append("''")
    punctuation.append("``")
    punctuation.append("—")
    punctuation.append("…")
    punctuation.append("...")
    punctuation.append("--")
    punctuation.append("..")
    stopwords_english.extend(punctuation)
    all_words_clean = []
    for word in all_words:
        if word not in stopwords_english and word not in string.digits:
            all_words_clean.append(word)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_clean,
                                                       min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    testing_set = sentim_analyzer.apply_features(testing_docs)

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        message = "Your classifier does not provide a show_most_informative_features() method."
        print(message)
        read_write.log_message(message)
        sentim_analyzer.evaluate(testing_set)
    classifier_accuracy_percent = (classify.accuracy(classifier,
                                                     testing_set)) * 100
    message_acc = 'Accuracy of classifier = ' + str(
        classifier_accuracy_percent) + '%'
    print(message_acc)
    read_write.log_message("[INFO]" + LOG_NAME + message_acc)

    save_file(sentim_analyzer, 'files/sa_subjectivity.pickle')
    message = "sa_subjectivity.pickle file saved."
    print(message)
    read_write.log_message(message)
Example #29
    bryant = gutenberg.sents('bryant-stories.txt')
    burgess = gutenberg.sents('burgess-busterbrown.txt')
    carroll = gutenberg.sents('carroll-alice.txt')
    ch_ball = gutenberg.sents('chesterton-ball.txt')
    ch_brown = gutenberg.sents('chesterton-brown.txt')
    ch_thurs = gutenberg.sents('chesterton-thursday.txt')
    edge = gutenberg.sents('edgeworth-parents.txt')
    mel = gutenberg.sents('melville-moby_dick.txt')
    mil = gutenberg.sents('milton-paradise.txt')
    caesar = gutenberg.sents('shakespeare-caesar.txt')
    hamlet = gutenberg.sents('shakespeare-hamlet.txt')
    macbeth = gutenberg.sents('shakespeare-macbeth.txt')
    whit = gutenberg.sents('whitman-leaves.txt')
    rural = abc.sents('rural.txt')
    science = abc.sents('science.txt')
    plots = subjectivity.sents('plot.tok.gt9.5000')
    quotes = subjectivity.sents('quote.tok.gt9.5000')
    austen = sense + emma + persuasion
    shakespeare = caesar + hamlet + macbeth
    facts = rural + science
    opinions = plots + quotes
    gute = bryant + burgess + carroll + edge + mel + mil + whit
    chester = ch_ball + ch_brown + ch_thurs
    total = austen + shakespeare + facts + opinions + gute + chester + b + sents
    #print(plots)
    #print(science)
    #print(bible)
    g = Word2Vec(total)
    g.wv.save_word2vec_format('model.bin', binary=True)

Example #30
def plot():
    global df
    global author1
    global author2
    global startDate
    global endDate
    global author1_wpm
    global author2_wpm
    global more_words
    global author1_messages
    global author2_messages
    global more_messages
    #restricting range of data
    mask = (df['Date'] >= startDate) & (df['Date'] <= endDate)
    df = df.loc[mask]

    author1_df = df.loc[(df['Author'] == author1)]
    author2_df = df.loc[(df['Author'] == author2)]
    author1_hour = author1_df['Hour'].value_counts()
    author2_hour = author2_df['Hour'].value_counts()

    def time_of_day_data():
        hours_dictionary = {}
        hours_dictionary['hourlist'] = ['Author 1', 'Author 2']
        for i in range(0, 24):
            t_list = [0, 0]
            j = str(i)
            if i < 10:
                j = '0' + j
            if i == 0:
                j = '00'
            if j in author1_hour.index.tolist():
                t_list[0] = author1_hour.loc[j].item()
            if j in author2_hour.index.tolist():
                t_list[1] = author2_hour.loc[j].item()
            hours_dictionary[j] = t_list
        for x in hours_dictionary:
            if x == 'hourlist':
                counter = 0
            elif int(hours_dictionary[x][0]) > counter:
                counter = int(hours_dictionary[x][0])
            elif int(hours_dictionary[x][1]) > counter:
                counter = int(hours_dictionary[x][1])
        return hours_dictionary, counter

    def roundup(x):
        return int(x) if x % 100 == 0 else int(x + 100 - x % 100)

    ### start of FIRST: time of day ###

    def plot_time_of_day():
        plt.style.use('fivethirtyeight')
        plt.style.use('bmh')

        plt.rcParams["font.family"] = "Gabriola"
        plt.rcParams.update({'font.size': 16})

        tod_data, maxcount = time_of_day_data()
        time_of_day_df = pd.DataFrame(tod_data)

        maxcount = roundup(maxcount) + 200
        a = roundup(maxcount / 4)
        b = roundup(maxcount / 2)
        c = roundup(3 * maxcount / 4)
        # No. of variable
        categories = list(time_of_day_df)[1:]
        N = len(categories)
        # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
        angles = [n / float(N) * 2 * pi for n in range(N)]
        angles += angles[:1]
        # Initialise the spider plot
        ax = plt.subplot(111, polar=True, label='time of day')
        # If you want the first axis to be on top:
        ax.set_theta_offset(pi / 2)
        ax.set_theta_direction(-1)
        # Draw one axis per variable and add labels
        plt.xticks(angles[:-1], categories, fontsize=16)
        # Draw ylabels
        ax.set_rlabel_position(0)
        plt.yticks([a, b, c], [str(a), str(b), str(c)], color="grey", size=12)
        plt.ylim(0, maxcount)

        # Ind1
        values = time_of_day_df.loc[0].drop(
            'hourlist').values.flatten().tolist()
        values += values[:1]
        ax.plot(angles,
                values,
                linewidth=1,
                linestyle='solid',
                label=author1_name,
                color=author1_colour)
        ax.fill(angles, values, author1_colour, alpha=0.1)
        # Ind2
        values = time_of_day_df.loc[1].drop(
            'hourlist').values.flatten().tolist()
        values += values[:1]
        ax.plot(angles,
                values,
                linewidth=1,
                linestyle='solid',
                label=author2_name,
                color=author2_colour)
        ax.fill(angles, values, author2_colour, alpha=0.1)
        # Add legend
        plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
        plt.savefig(os.path.join("uploads", 'timeofday.png'),
                    bbox_inches='tight')

    plot_time_of_day()

    ### end of FIRST: time of day ###

    author1_day = author1_df['Day_of_week'].value_counts()
    author2_day = author2_df['Day_of_week'].value_counts()
    days_in_order = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
        'Sunday'
    ]

    def day_of_week_data():
        day_dictionary = {}
        day_dictionary['Day'] = ['Author 1', 'Author 2']
        for dayname in days_in_order:
            t_list = [0, 0]
            if dayname in author1_day.index.tolist():
                t_list[0] = author1_day.loc[dayname].item()
            if dayname in author2_day.index.tolist():
                t_list[1] = author2_day.loc[dayname].item()
            day_dictionary[dayname] = t_list
        for x in day_dictionary:
            if x == 'Day':
                counter = 0
            else:
                temp = max(int(day_dictionary[x][0]),
                           int(day_dictionary[x][1]))
                if temp > counter:
                    counter = temp
        return day_dictionary, counter

    ### start of SECOND: Day of week ###

    def plot_day_of_week():
        plt.style.use('fivethirtyeight')
        plt.style.use('bmh')

        plt.rcParams["font.family"] = "Gabriola"
        plt.rcParams.update({'font.size': 16})

        dow_data, maxcount = day_of_week_data()
        day_of_week_df = pd.DataFrame(dow_data)

        maxcount = roundup(maxcount) + 200
        a = roundup(maxcount / 4)
        b = roundup(maxcount / 2)
        c = roundup(3 * maxcount / 4)
        # number of variable
        categories = list(day_of_week_df)[1:]
        N = len(categories)
        # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
        angles = [n / float(N) * 2 * pi for n in range(N)]
        angles += angles[:1]
        # Initialise the spider plot
        ax = plt.subplot(111, polar=True, label='day of week')
        # If you want the first axis to be on top:
        ax.set_theta_offset(pi / 2)
        ax.set_theta_direction(-1)
        # Draw one axis per variable and add labels
        plt.xticks(angles[:-1], categories, fontsize=16)

        for label, i in zip(ax.get_xticklabels(), range(0, len(angles))):
            angle_rad = angles[i]
            if angle_rad == 0:
                ha = 'center'
            elif angle_rad <= pi / 2:
                ha = 'left'
            elif pi / 2 < angle_rad <= pi:
                ha = 'left'
            elif pi < angle_rad <= (3 * pi / 2):
                ha = 'right'
            else:
                ha = 'right'

            label.set_horizontalalignment(ha)

        # Draw ylabels
        ax.set_rlabel_position(0)
        plt.yticks([a, b, c], [str(a), str(b), str(c)], color="grey", size=12)
        plt.ylim(0, maxcount)

        # Ind1
        values = day_of_week_df.loc[0].drop('Day').values.flatten().tolist()
        values += values[:1]
        ax.plot(angles,
                values,
                linewidth=1,
                linestyle='solid',
                label=author1_name,
                color=author1_colour)
        ax.fill(angles, values, author1_colour, alpha=0.1)
        # Ind2
        values = day_of_week_df.loc[1].drop('Day').values.flatten().tolist()
        values += values[:1]
        ax.plot(angles,
                values,
                linewidth=1,
                linestyle='solid',
                label=author2_name,
                color=author2_colour)
        ax.fill(angles, values, author2_colour, alpha=0.1)
        # Add legend
        plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
        plt.savefig(os.path.join("uploads", 'dayofweek.png'),
                    bbox_inches='tight')

    plot_day_of_week()

    ### end of SECOND: Day of week ###

    def timeline_data():
        timeline_dictionary = {}
        timeline_dictionary['date'] = ['Author1', 'Author2']
        for i in range(len(df)):
            t_list = [0, 0]
            day, author = df.iloc[i, 0], df.iloc[i, 2]
            if day not in timeline_dictionary:
                timeline_dictionary[day] = t_list
            t_list = timeline_dictionary[day]
            if author == author1:
                t_list[0] += 1
            if author == author2:
                t_list[1] += 1
            timeline_dictionary[day] = t_list
        return timeline_dictionary

    timeline_df = pd.DataFrame(timeline_data())
    timeline_df = timeline_df.T
    new_header = timeline_df.iloc[0]
    timeline_df = timeline_df[1:]
    timeline_df.columns = new_header

    ### start of THIRD: timeline ###

    def plot_timeline():
        plt.style.use('fivethirtyeight')
        plt.rcParams["font.family"] = "Gabriola"
        plt.rcParams.update({'font.size': 24})

        plt.figure(figsize=(20, 8))
        plt.xlabel('Timeline', fontsize=30)

        ax1 = timeline_df.Author1.plot(color=author1_colour)
        ax2 = timeline_df.Author2.plot(color=author2_colour)

        ax1.xaxis.set_label_position('top')
        ax1.legend([author1_name, author2_name], loc='upper right')
        plt.savefig(os.path.join("uploads", 'timeline.png'),
                    bbox_inches='tight')

    plot_timeline()

    ### end of THIRD: timeline ###

    def top_words(df):
        top_N = 40
        stopwords = nltk.corpus.stopwords.words('english')
        # RegEx for stopwords
        RE_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords))
        #RE_stopwords.extend(['from', 'subject', 're', 'edu', 'use'])
        # replace '|'-->' ' and drop all stopwords
        words = (df.Message\
            .str.lower()\
                .replace([RE_stopwords], [''], regex=True)\
                    .str.cat(sep=' ')\
                        .split())
        words = [word for word in words if len(word) > 3]
        # generate DF out of Counter
        rslt = pd.DataFrame(Counter(words).most_common(top_N),
                            columns=['Word', 'Frequency']).set_index('Word')
        return rslt

    def hex_to_rgb(hex):
        hex = hex.lstrip('#')
        hlen = len(hex)
        return tuple(
            int(hex[i:i + hlen // 3], 16) for i in range(0, hlen, hlen // 3))

    def rgb_to_hsl(r, g, b):
        r = float(r)
        g = float(g)
        b = float(b)
        high = max(r, g, b)
        low = min(r, g, b)
        h, s, l = ((high + low) / 2, ) * 3
        if high == low:
            h = 0.0
            s = 0.0
        else:
            d = high - low
            s = d / (2 - high - low) if l > 0.5 else d / (high + low)
            h = {
                r: (g - b) / d + (6 if g < b else 0),
                g: (b - r) / d + 2,
                b: (r - g) / d + 4,
            }[high]
            h /= 6
        return h, s, l

    a1_rgb = hex_to_rgb(author1_colour)
    a2_rgb = hex_to_rgb(author2_colour)

    a1_hlsva = rgb_to_hsl(a1_rgb[0] / 255, a1_rgb[1] / 255, a1_rgb[2] / 255)
    a2_hlsva = rgb_to_hsl(a2_rgb[0] / 255, a2_rgb[1] / 255, a2_rgb[2] / 255)

    a1_hlsva0 = round(a1_hlsva[0] * 355)
    a1_hlsva1 = round(a1_hlsva[1] * 100)
    a1_hlsva2 = round(a1_hlsva[2] * 100)

    a2_hlsva0 = round(a2_hlsva[0] * 355)
    a2_hlsva1 = round(a2_hlsva[1] * 100)
    a2_hlsva2 = round(a2_hlsva[2] * 100)

    ############################

    df_1 = top_words(author1_df)
    df_1.columns
    d = dict(zip(df_1.index, df_1.Frequency))

    plt.style.use('fivethirtyeight')
    plt.rcParams["font.family"] = "Gabriola"
    plt.rcParams.update({'font.size': 16})

    fileloc = os.path.join("static", shape + '.jpg')
    mask = np.array(Image.open(fileloc))
    wordcloud = WordCloud(background_color='#F0F0F0',
                          mask=mask,
                          width=mask.shape[1],
                          height=mask.shape[0])
    wordcloud.generate_from_frequencies(frequencies=d)
    plt.figure()

    def a1_color_func(word,
                      font_size,
                      position,
                      orientation,
                      random_state=None,
                      **kwargs):
        return "hsl({0}, {1}%%, %d%%)".format(
            str(a1_hlsva0), str(a1_hlsva1)) % random.randint(60, 90)

    plt.imshow(wordcloud.recolor(color_func=a1_color_func),
               interpolation="bilinear")
    plt.axis("off")
    wordcloud.to_file(os.path.join("uploads", 'author1cloud.png'))

    df_2 = top_words(author2_df)
    df_2.columns
    d = dict(zip(df_2.index, df_2.Frequency))

    plt.style.use('fivethirtyeight')
    plt.rcParams["font.family"] = "Gabriola"
    plt.rcParams.update({'font.size': 16})

    wordcloud = WordCloud(background_color='#F0F0F0',
                          mask=mask,
                          width=mask.shape[1],
                          height=mask.shape[0])
    wordcloud.generate_from_frequencies(frequencies=d)
    plt.figure()

    def a2_color_func(word,
                      font_size,
                      position,
                      orientation,
                      random_state=None,
                      **kwargs):
        return "hsl({0}, {1}%%, %d%%)".format(
            str(a2_hlsva0), str(a2_hlsva1)) % random.randint(60, 90)

    plt.imshow(wordcloud.recolor(color_func=a2_color_func),
               interpolation="bilinear")
    plt.axis("off")
    wordcloud.to_file(os.path.join("uploads", 'author2cloud.png'))

    n_instances = 100
    subj_docs = [
        (sent, 'subj')
        for sent in subjectivity.sents(categories='subj')[:n_instances]
    ]
    obj_docs = [(sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:n_instances]]
    train_subj_docs = subj_docs[:80]
    test_subj_docs = subj_docs[80:100]
    train_obj_docs = obj_docs[:80]
    test_obj_docs = obj_docs[80:100]
    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs
    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg,
                                                       min_freq=4)
    len(unigram_feats)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)
    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)
    '''
    for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))
    '''
    df['Message'].unique()[10:20]

    def sentiment(message):
        sid = SentimentIntensityAnalyzer()
        ss = sid.polarity_scores(message)
        return ss['compound']

    df["Sentiment"] = df.apply(lambda row: sentiment(row['Message']), axis=1)

    def sentiment_data():
        sentiment_dictionary = {}
        sentiment_dictionary['date'] = ['Author1', 'Author2']
        for i in range(len(df)):
            t_list = [[0, 0.0], [0, 0.0]]
            month, author, sentiment = str(df.iloc[i,
                                                   0]), df.iloc[i,
                                                                2], df.iloc[i,
                                                                            6]
            if sentiment != 0.0:
                month = month.split('-')[0] + '-' + month.split('-')[1]
                if month not in sentiment_dictionary:
                    sentiment_dictionary[month] = t_list
                t_list = sentiment_dictionary[month]
                if author == author1:
                    t_list[0][0] += 1
                    t_list[0][1] += sentiment
                if author == author2:
                    t_list[1][0] += 1
                    t_list[1][1] += sentiment
                sentiment_dictionary[month] = t_list

        for x in sentiment_dictionary:
            if x != 'date':
                t_list = sentiment_dictionary[x]
                if t_list[0][0] != 0:
                    t_list[0] = float(t_list[0][1]) / float(t_list[0][0])
                else:
                    t_list[0] = 0
                if t_list[1][0] != 0:
                    t_list[1] = float(t_list[1][1]) / float(t_list[1][0])
                else:
                    t_list[1] = 0
                sentiment_dictionary[x] = t_list
        return sentiment_dictionary

    sentiment_df = pd.DataFrame(sentiment_data())
    sentiment_df = sentiment_df.T
    new_header = sentiment_df.iloc[0]
    sentiment_df = sentiment_df[1:]
    sentiment_df.columns = new_header

    def plot_sentiment():
        plt.style.use('fivethirtyeight')
        plt.rcParams["font.family"] = "Gabriola"
        plt.rcParams.update({'font.size': 24})

        plt.figure(figsize=(20, 8))
        plt.xlabel('Sentiment Analysis', fontsize=30)

        ax1 = sentiment_df.Author1.plot(color=author1_colour)
        ax2 = sentiment_df.Author2.plot(color=author2_colour)
        ax1.xaxis.set_label_position('top')
        h1, l1 = ax1.get_legend_handles_labels()

        ax1.legend([author1_name, author2_name], loc='upper right')
        plt.savefig(os.path.join("uploads", 'sentiment.png'),
                    bbox_inches='tight')

    plot_sentiment()

    #number of words
    def no_of_words(message):
        return len(message.split())

    df["WordCount"] = df.apply(lambda row: no_of_words(row['Message']), axis=1)
    author1_df = df.loc[(df['Author'] == author1)]
    author2_df = df.loc[(df['Author'] == author2)]
    author1_wpm = author1_name + "'s average word per message is {:0.2f}".format(
        author1_df["WordCount"].mean())
    author2_wpm = author2_name + "'s average word per message is {:0.2f}".format(
        author2_df["WordCount"].mean())

    def who_sent_more_words():
        if author1_df["WordCount"].sum() > author2_df["WordCount"].sum():
            num = author1_df["WordCount"].sum() / author2_df["WordCount"].sum()
            num = num * 100 - 100
            return (author1_name +
                    " sent {:0.0f}% more words than ".format(num) +
                    author2_name)
        elif author2_df["WordCount"].sum() > author1_df["WordCount"].sum():
            num = author2_df["WordCount"].sum() / author1_df["WordCount"].sum()
            num = num * 100 - 100
            return (author2_name +
                    " sent {:0.0f}% more words than ".format(num) +
                    author1_name)
        else:
            return ("You both sent the same number of words somehow!")

    more_words = who_sent_more_words()

    days = "Number of days of texting: " + str(len(df["Date"].unique()))

    author1_messages = author1_name + " sent " + str(len(
        author1_df.index)) + " messages"
    author2_messages = author2_name + " sent " + str(len(
        author2_df.index)) + " messages"

    def who_sent_more():
        if len(author1_df.index) > len(author2_df.index):
            num = len(author1_df.index) / len(author2_df.index)
            return (author1_name +
                    " sent {:0.2f} times more messages than ".format(num) +
                    author2_name)
        elif len(author2_df.index) > len(author1_df.index):
            num = len(author2_df.index) / len(author1_df.index)
            return (author2_name +
                    " sent {:0.2f} times more messages than ".format(num) +
                    author1_name)
        else:
            return ("You both sent the same number of messages somehow!")

    more_messages = who_sent_more()
    return res


Example #31
# def is_stopword(word):
#     return word in stop
#
# def is_allowed(word):
#     if word.endswith("_NEG") and len(word)-4>3:
#         return True
#     elif len(word)>3:
#         return True
#     return False

no_of_reviews = 6000
subj_docs = [(transform(sent), 'subj')
             for sent in subjectivity.sents(categories='subj')[:no_of_reviews]]
obj_docs = [(transform(sent), 'obj')
            for sent in subjectivity.sents(categories='obj')[:no_of_reviews]]
print("subj: %d, obj: %d" % (len(subj_docs), len(obj_docs)))

partition = int(0.80 * no_of_reviews)

train_subj_docs = subj_docs[:partition]
test_subj_docs = subj_docs[partition:]
train_obj_docs = obj_docs[:partition]
test_obj_docs = obj_docs[partition:]

training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

# applying filters
Example #32
conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")

conll2007_corp_sents = conll2007.sents()
print("conll2007 to sents")
inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")
abc_corp_sents = abc.sents()
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents ")
guttenberg_corp_sents = gutenberg.sents()
print("Gutenberg to sents")
treebank_corb_sents = treebank.sents()
print("Treebank to sents")
reuters_corp_sents = reuters.sents()
print("Reuters to sents")
webtext_corp_sents = webtext.sents()
print("Webtext to sents")

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
Example #33
            outputScore.close()
            return True

        except Exception as e:
            print(e)

    def on_error(self, status):
        print(status)


try:

    n_instances = 100
    subj_docs = [
        (sent, 'subj')
        for sent in subjectivity.sents(categories='subj')[:n_instances]
    ]
    obj_docs = [(sent, 'obj')
                for sent in subjectivity.sents(categories='obj')[:n_instances]]
    len(subj_docs), len(obj_docs)
    subj_docs[0]
    train_subj_docs = subj_docs[:80]
    test_subj_docs = subj_docs[80:100]
    train_obj_docs = obj_docs[:80]
    test_obj_docs = obj_docs[80:100]
    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs
    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg,
Example #34
def sample():
	n_instances = 200

	subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
	obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
	print("subj docs", len(subj_docs), "obj docs", len(obj_docs))