# Imports assumed by this excerpt; the sentiment_analyzer module path is an assumption.
import pandas as pd
from collections import defaultdict
from sentiment_analyzer import SentimentAnalyzer


def main():
    tweets = pd.read_csv('tweets.csv')

    LABEL_PATH = "labels/"
    MODEL_PATH = "model/"

    analyzer = SentimentAnalyzer(MODEL_PATH, LABEL_PATH)

    data = defaultdict(lambda: defaultdict(int))
    universities = tweets.iloc[:, 0]
    tweets_text = tweets.iloc[:, 1]

    print("Predicting sentiments...")
    sentiments = analyzer.batch_predict_sentiment(tweets_text)

    for i in range(len(universities)):
        university = universities[i]
        sentiment = sentiments[i]

        data[university][sentiment] += 1

    print(data)

    # Calculate the positive/negative ratio for each university.
    # (Assumes every university has at least one negative tweet; otherwise this raises ZeroDivisionError.)
    university_ratio_posneg = []
    for university in data:
        pos_neg_ratio = data[university]["Positive"] / data[university][
            "Negative"]
        university_ratio_posneg.append((university, pos_neg_ratio))

    university_ratio_posneg.sort(key=lambda x: x[1], reverse=True)

    for university, pos_neg_ratio in university_ratio_posneg:
        print(university, pos_neg_ratio)
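
If a university has no negative tweets, the ratio loop above raises a ZeroDivisionError. The sketch below is a hedged alternative (not part of the original script) that applies add-one smoothing to both counts before sorting.

# Hedged alternative to the ratio loop above: add-one smoothing avoids ZeroDivisionError
university_ratio_posneg = [
    (university, (counts["Positive"] + 1) / (counts["Negative"] + 1))
    for university, counts in data.items()
]
university_ratio_posneg.sort(key=lambda x: x[1], reverse=True)
for university, pos_neg_ratio in university_ratio_posneg:
    print(university, pos_neg_ratio)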
Example #2
class SentenceCompleter():
    def __init__(self):
        self.generator = Generator()
        self.sentiment_analyzer = SentimentAnalyzer()

        # Initialize everything (takes time)
        self.generator.initialize()
        self.sentiment_analyzer.initialize()

    def complete_sentence(self, prefix):
        iteration = 0
        while iteration < 20:
            sentence = self.generator.complete(prefix)
            logging.info("Generated %s, trial %d" % (sentence, iteration))
            if not self.sentiment_analyzer.is_negative(sentence):
                return sentence
            logging.warning("Negative sentence generated: %s, trial %d" % (sentence, iteration))
            iteration += 1  # Try again
        # Falls through and implicitly returns None if all 20 completions were negative.

    def complete_prettify_shorten_sentence(self, prefix, length):
        sentence = self.complete_sentence(prefix)
        if sentence is None:  # every attempt was negative
            return None
        sentence = sentence.replace("' ", "'")[:length]
        if "." in sentence:
            sentence = sentence[:sentence.rfind(".") + 1]
        return sentence
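
A minimal usage sketch for the class above; the prefix string is illustrative, and Generator and SentimentAnalyzer are assumed to be importable from the same project.

completer = SentenceCompleter()      # heavy: initializes the generator and the analyzer
sentence = completer.complete_sentence("The demo went")
if sentence is not None:             # None means all 20 completions came back negative
    print(sentence)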
Example #3
 def build_report(self):
     global active_file
     global sub_name
     try:
         active_file = filedialog.askopenfilename(
             initialdir=os.getcwd(),
             title='Open Report',
             filetypes=(("json gz files", "*.gz"), ("all files", "*.*")))
         print(active_file)
         sub_name = os.path.basename(active_file).split('_')[0]
         df = self.data_source.load_from_file(active_file)
         if self.method_selection.get() == 'quick':
             df = apply_sentiment_intensity(df)
         else:
             sentiment_analyzer = SentimentAnalyzer()
             df = sentiment_analyzer.predict(
                 df[::20]
             )  # every 20th record - it is still too slow to process all records
         self.show_report(df)
     except FileNotFoundError as e:
         messagebox.showerror(
             "Error",
             "Invalid file loaded. Please try gathering data again or selecting another dataset."
         )
         print(e)
Example #4
    def __init__(self):
        self.generator = Generator()
        self.sentiment_analyzer = SentimentAnalyzer()

        # Initialize everything (takes time)
        self.generator.initialize()
        self.sentiment_analyzer.initialize()
Example #5
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between subjective
        and objective.
    :param output: the output file where results have to be reported.
    """
    from sentiment_analyzer import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = int(n_instances/2)

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs+train_obj_docs
    testing_docs = test_subj_docs+test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer
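
A minimal invocation sketch for the demo above, assuming NLTK's NaiveBayesClassifier as the trainer; the instance count and the decision to pickle the analyzer are illustrative.

from nltk.classify import NaiveBayesClassifier

analyzer = demo_subjectivity(NaiveBayesClassifier.train,
                             save_analyzer=True,
                             n_instances=2000)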
Example #6
    def test_init_negative01(self):
        with self.assertRaises(TypeError):
            sent = SentimentAnalyzer(feature_extractor=1,
                                     classifier=self.classifier)

        with self.assertRaises(TypeError):
            sent = SentimentAnalyzer(feature_extractor=self.feature_extractor,
                                     classifier=1)
Example #7
class Utility:
    def __init__(self):
        self.sentiment = SentimentAnalyzer()
        self.clf = self.sentiment.clf

    def classifiersVsFeatures(self):
        with open('pickled/features_train.pickle', 'rb') as features_train:
            X_train = pickle.load(features_train)
        with open('pickled/features_test.pickle', 'rb') as features_test:
            X_test = pickle.load(features_test)
        with open('pickled/labels_train.pickle', 'rb') as labels_train:
            y_train = pickle.load(labels_train)
        with open('pickled/labels_test.pickle', 'rb') as labels_test:
            y_test = pickle.load(labels_test)

        num_features = [10000, 50000, 100000, 500000, 1000000]

        acc = []
        for i in range(0, len(self.clf)):
            acc.append([])

        for k in num_features:
            pipeline, model = self.sentiment.trainData(X_train, y_train,
                                                       self.clf, k)
            prediction = self.sentiment.predictData(X_test, model)
            clf_metrics = self.sentiment.evaluate(y_test, prediction)

            for j in range(0, len(self.clf)):
                print(clf_metrics[0][j])
                acc[j].append(
                    clf_metrics[0]
                    [j])  # Append the accuracy of the classifier for each k

        data = []
        for i in range(0, len(self.clf)):
            data.append({'x': num_features, 'y': acc[i]})

        return data

    def showTopFeatures(self, pipeline, n=20):
        vectorizer = pipeline.named_steps['vect']
        clf = pipeline.named_steps['clf']
        feature_names = vectorizer.get_feature_names()

        coefs = sorted(zip(clf.coef_[0], feature_names), reverse=True)
        topn = zip(coefs[:n], coefs[:-(n + 1):-1])

        top_features = []
        for (coef_p, feature_p), (coef_n, feature_n) in topn:
            top_features.append('{:0.4f}{: >25}    {:0.4f}{: >25}'.format(
                coef_p, feature_p, coef_n, feature_n))

        return '\n'.join(top_features)
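
A hedged sketch of consuming the data returned by classifiersVsFeatures(); it assumes matplotlib is installed and that the pickled feature/label splits referenced above already exist.

import matplotlib.pyplot as plt

util = Utility()
for series in util.classifiersVsFeatures():   # one {'x': num_features, 'y': accuracies} dict per classifier
    plt.plot(series['x'], series['y'], marker='o')
plt.xlabel('number of features')
plt.ylabel('accuracy')
plt.show()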
Example #8
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train classifier on all instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from sentiment_analyzer import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances/2)

    pos_docs = [(list(movie_reviews.words(pos_id)), 'pos') for pos_id in movie_reviews.fileids('pos')[:n_instances]]
    neg_docs = [(list(movie_reviews.words(neg_id)), 'neg') for neg_id in movie_reviews.fileids('neg')[:n_instances]]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs+train_neg_docs
    testing_docs = test_pos_docs+test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                        Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
                        Instances=n_instances)
Example #9
 def test_analyze_positive01(self):
     # Russian fixture: two sites with short reviews ("Your bank is total crap!",
     # "Your shop is okay", "I don't care about your restaurant").
     X = OrderedDict([('Первый сайт',
                       ['Ваш банк полный ацтой!', 'Ваш магаз — нормас']),
                      ('Второй сайт', ['Мне пофиг на ваш ресторан'])])
     len_X = sum([len(X[key]) for key in X])
     sent = SentimentAnalyzer(feature_extractor=self.feature_extractor,
                              classifier=self.classifier)
     output = sent.analyze(X)
     self.assertIsInstance(output, tuple)
     self.assertEqual(len(output), 3)
     self.assertIsInstance(output[0], int)
     self.assertIsInstance(output[1], int)
     self.assertIsInstance(output[2], int)
     self.assertEqual(output[0] + output[1] + output[2], len_X)
     del sent
Example #10
def sentiment_analysis(translated_articles_path):
    '''
    Generate a sentiment score for each translated article and write the result back to file.
    :param translated_articles_path: directory containing the translated article spreadsheets
    :return: None
    '''
    sa = SentimentAnalyzer()
    file_names = os.listdir(translated_articles_path)
    for i, file in enumerate(file_names):
        print('\r generate sentiment for {}, {}/{}'.format(
            file, str(i + 1), len(file_names)),
              end='')
        # NOTE: reads from the hard-coded 'translated/' directory but writes back to
        # translated_articles_path.
        df = pd.read_excel('translated/' + file)
        df['sentiment'] = sa.analise_texts(df['content_translated'])
        df.to_excel(translated_articles_path + file)
    print()
Example #11
 def test_pickle_unpickle_positive01(self):
     X = OrderedDict({
         'Первый сайт': ['Ваш банк полный ацтой!', 'Ваш магаз — нормас'],
         'Второй сайт': ['Мне пофиг на ваш ресторан']
     })
     sent = SentimentAnalyzer(feature_extractor=self.feature_extractor,
                              classifier=self.classifier)
     output1 = sent.analyze(X)
     with open('sent.pkl', 'wb') as f:
         pickle.dump(sent, f)
     del sent
     with open('sent.pkl', 'rb') as f:
         sent = pickle.load(f)
     output2 = sent.analyze(X)
     self.assertEqual(output1, output2)
     del sent
Example #12
def set_up():
    # before doing anything else, initialize the sentiment analyzer
    global sentiment_analyzer
    sentiment_analyzer = SentimentAnalyzer()
    print("initialized sentiment analyzer")

    # first check to see whether or not the data files are empty or not
    # if they are nonempty, then notify, and quit
    if is_non_zero_file(setup.RAW_DATA_PATH) or is_non_zero_file(
            setup.PROCESSED_DATA_PATH):
        print("Data already exists, either at " + setup.RAW_DATA_PATH +
              " or at " + setup.PROCESSED_DATA_PATH + ".")
        print(
            "Delete these files (after having copied its contents, perhaps), and try again."
        )
        print("Measure.py will now self destruct.")
        import sys
        sys.exit()
    # Set up the two data files
    # the raw data file should be in json format
    # thus, write to it an opening square bracket
    with open(setup.RAW_DATA_PATH, "w") as raw_file:
        raw_file.write("[")
    # the processed data file should be a csv file
    with open(setup.PROCESSED_DATA_PATH, "w") as processed_file:
        processed_file.write(
            "Date,Hate Speech Tally,Offensive But Not Hate Speech Tally,Not Offensive Tally,Combined Score,Total Tally\n"
        )
    # load the user ids
    global user_ids
    with open(setup.USER_IDS_PATH, "r") as user_ids_file:
        for line in user_ids_file:
            sml = line.strip().split()
            user_id = int(sml[0])
            user_ids.append(user_id)
Example #13
def courses_by_popularity():
    course_tracker = CourseTracker()
    db = DatabaseManager()
    analyzer = SentimentAnalyzer()
    sentiments = []
    for course in course_tracker.get_all_courses():
        course_comments = db.comments_containing(course.name)
        if len(course_comments) == 0:
            continue
        course_sentiments = [
            analyzer.analyze_sentiment(c.content) for c in course_comments
        ]
        avg_sentiment = sum(course_sentiments) / len(course_sentiments)
        sentiments.append({
            "course": course.name,
            "avg_sentiment": avg_sentiment
        })

    return jsonify(
        sorted(sentiments, key=lambda x: x["avg_sentiment"], reverse=True))
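
The use of jsonify suggests this function is served by a Flask app; the sketch below shows one hedged way to wire it up, with the route path chosen purely for illustration.

from flask import Flask

app = Flask(__name__)
app.add_url_rule('/courses/popularity', view_func=courses_by_popularity)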
Example #14
def prepareSentimentAnalyzer():
  # import serializer. try cPickle if available.
  try:
    import cPickle as pickle
  except ImportError:
    import pickle

  # try loading a previously serialized sentiment analyzer instance, if possible.
  # otherwise, train from scratch.
  try:
    with open(SA_SERIAL_FILE, 'rb') as f:
      sa = pickle.load(f)
  except (IOError, pickle.PickleError):
    print 'Preparing sentiment analyzer for the first time. Please wait...'
    sa = SentimentAnalyzer()
    sa.train()
    print 'Done.\n'
    with open(SA_SERIAL_FILE, 'wb') as f:
      pickle.dump(sa, f)
  return sa
Example #15
def initializeAnalyzer():
    path_to_emoti_file = "../source/lexicon/EmoticonSentimentLexicon.txt"
    path_to_neut_signs_file = "../source/lexicon/NeutralitySigns.txt"
    path_to_polar_nouns = "../source/lexicon/polarized_nouns.txt"
    path_to_polar_verbs = "../source/lexicon/polarazed_verbs.txt"
    path_to_polar_adjectives = "../source/lexicon/polarized_adjectives.txt"
    path_to_polar_conjunctions = "../source/lexicon/polarized_conjunctions.txt"
    path_to_foma_dividers = "../source/lexicon/foma_features_dividers.txt"
    path_to_foma_reversers = "../source/lexicon/foma_features_reversers.txt"
    path_to_meaning_reversers = "../source/lexicon/meaning_reverser.txt"
    path_to_punctuation_signs = "../source/lexicon/punctuation_signs.txt"

    emoti_dict = getEmotiDictionary(file_path=path_to_emoti_file)
    neut_set = getSetOfWordsFromFile(file_path=path_to_neut_signs_file)
    polar_noun = getPolarValues(file_path=path_to_polar_nouns)
    polar_vrb = getPolarValues(file_path=path_to_polar_verbs)
    polar_adj = getPolarValues(file_path=path_to_polar_adjectives)
    polar_conj = getPolarValues(file_path=path_to_polar_conjunctions)
    foma_dividers = getSetOfWordsFromFile(file_path=path_to_foma_dividers)
    foma_reversers = getSetOfWordsFromFile(file_path=path_to_foma_reversers)
    meaning_reversers = getSetOfWordsFromFile(
        file_path=path_to_meaning_reversers)
    punctuation_signs = getSetOfWordsFromFile(
        file_path=path_to_punctuation_signs)
    emoti_dict = organizedListOfEmoties(emoti_dict)
    analyzer = SentimentAnalyzer(emoti_dict=emoti_dict,
                                 neutral_signs=neut_set,
                                 polar_nouns=polar_noun,
                                 polar_verbs=polar_vrb,
                                 polar_adjectives=polar_adj,
                                 polar_conjunctions=polar_conj,
                                 foma_dividers=foma_dividers,
                                 foma_reversers=foma_reversers,
                                 punctuation_signs=punctuation_signs,
                                 meaning_reversers=meaning_reversers,
                                 vrb_prob_coef=3,
                                 sent_coef_decr=.2,
                                 coef_of_postg_change=.1)
    # print("FROM ANALYZER",analyzer.isPartOf("",analyzer.polar_noun))
    return analyzer
Example #16
def main():
    # before doing anything else, initialize the sentiment analyzer
    global sentiment_analyzer
    sentiment_analyzer = SentimentAnalyzer()
    print("initialized sentiment analyzer")

    # also initialize the user ids array to all the elements currently in the user ids data file
    # first create the file
    open(setup.USER_IDS_PATH, "a").close()
    # then open it, and load all of its contents into the user_ids list
    global user_ids
    with open(setup.USER_IDS_PATH, "r") as user_ids_file:
        for line in user_ids_file:
            sml = line.strip().split()
            user_id = int(sml[0])
            hatefulness_score = float(sml[1])
            user_ids.append((user_id, hatefulness_score))

    # define two threads: one user_abort, and one setup_streamer (which also, incidentally, starts the streamer)
    abort_thread = Thread(target = user_abort)
    streamer_thread = Thread(target = setup_streamer)
    # start them both
    abort_thread.start()
    streamer_thread.start()
Example #17
    except ImportError:
        pass

print('async_mode is ' + async_mode)

import eventlet
eventlet.monkey_patch()
app = Flask(__name__)
app.config['SECRET_KEY'] = 'secret!'
socketio = SocketIO(app, async_mode=async_mode)
thread = None

cursor = db.cursor()
cursor.execute("USE sp_data")

analyzer = SentimentAnalyzer()
analyzer.set_data(positive_tweets, negative_tweets)
#analyzer.train_data()
analyzer.get_training_data()

tweet_query = "SELECT text, ST_X(coordinates) AS lat, ST_Y(coordinates) AS lon, created_at, country_code, lang FROM test_cases"

try:
    cursor.execute(tweet_query)
    test_tweets = cursor.fetchall()
    print "Executing SQL statement"
except:
    print "Error: cannot fetch data!"


def do_analysis():
Example #18
def analysisSearch():
    if request.method == 'POST':
        try:
            quantities = []
            query_search = ''
            term = request.form['term']  #Term that user types
            retweets = request.form[
                'retweets']  #if user wants analysis with or without retweets
            retweets = int(retweets)  #convert retweets to int
            if retweets == 1:  #if retweets equals 1 we will exclude retweets and work only with original tweets
                query_search = term + ' -filter:retweets'
                query_search = str(query_search)
            elif retweets == 2:  #condition if retweets equals 2 we will work with original tweets and retweets
                query_search = term
                query_search = str(query_search)
            tweets = tweepy.Cursor(api.search,
                                   q=query_search,
                                   lang='es',
                                   tweet_mode='extended').items(10)  # get 10 tweets
            sentiment_analyzer = SentimentAnalyzer()  # instantiate the SentimentAnalyzer class
            scores_list = sentiment_analyzer.get_scores_list(
                tweets)  #get the list of scores without zeros
            array_tweets_score = sentiment_analyzer.array_of_tweets_and_score_method(
            )  #get array of objects with text and score
            arrays_ordered = sentiment_analyzer.order_arrays_list(
            )  #list in order to create arrays ordered
            text_positive = sentiment_analyzer.get_postive_text(
                arrays_ordered)  #get only positive text
            text_negative = sentiment_analyzer.get_negative_text(
                arrays_ordered)  #get only negative text
            text_neutral = sentiment_analyzer.get_neutral_text(
                arrays_ordered)  #get only neutral text
            percentages = sentiment_analyzer.get_percentages(
            )  #get percentages
            positive_quantity = len(
                text_positive)  #get quantity of positive tweets
            negative_quantity = len(
                text_negative)  #get quantity of negative tweets
            neutral_quantity = len(
                text_neutral)  #get quantity of neutral tweets
            total_quantity = positive_quantity + negative_quantity + neutral_quantity  #get quantity of total tweets
            quantities.extend([
                positive_quantity, negative_quantity, neutral_quantity,
                total_quantity
            ])  #insert quantities into the list
            data = [{
                "text_positive": text_positive
            }, {
                "text_negative": text_negative
            }, {
                "text_neutral": text_neutral
            }, {
                "percentages": percentages
            }, {
                "quantities": quantities
            }]  #create the list that will be sent to the template
            scores_array = np.array(scores_list)  #for plotting
            sns.set()  #for plotting
            ax = sns.distplot(scores_array)
            plt.show()  #for plotting
            return render_template(
                'showAnalysis.html',
                data=data)  #rendering showAnalysis and passing data as data
        except tweepy.TweepError as e:
            print(e.reason)
Example #19
"""

token_list = []

for x in range(len(df)):
    tokens = BagOfWords().get_tokens(df['body'][x][0])
    token_list.append(tokens)

df['body_tokens'] = token_list

# pp.pprint(df)
"""
SentimentAnalyzer
"""

sa = SentimentAnalyzer()
# print(sa.do_pos_sentiment_analysis(df['body_tokens'][0]))
# print(sa.do_neg_sentiment_analysis(df['body_tokens'][0]))

pos_sent_list = []
for x in range(len(df)):
    pos_sent_result = sa.do_pos_sentiment_analysis(df['body_tokens'][x])
    pos_sent_list.append(pos_sent_result[0])

df['% Positive'] = pos_sent_list

neg_sent_list = []
for x in range(len(df)):
    neg_sent_result = sa.do_neg_sentiment_analysis(df['body_tokens'][x])
    neg_sent_list.append(neg_sent_result[0])
Example #20
# connect to kafka producer
kafka_producer = KafkaProducer(bootstrap_servers=[kafka_host],
                               value_serializer=lambda x: dumps(x).encode('utf-8'))

tr = TweetRetriever()

# shared object between consumer and producer.
queue = Queue(maxsize=20000)

producer = Producer_And_Consume.Producer("producer", queue)
trends_consumer = []
sentiment_analyzers = []
for i in range(no_of_config_key):
    sentiment_analyzer = SentimentAnalyzer(tr, [twitter_keys[i]['streamConsumerKey'],
                                                           twitter_keys[i]['streamConsumerSecret'],
                                                           twitter_keys[i]['streamAccessTokenKey'],
                                                           twitter_keys[i]['streamAccessTokenSecret']])
    try:
        log.info("Starting Producer_And_Consume.ConsumerThread")
        cr = Producer_And_Consume.ConsumerThread(str(i), queue, kafka_producer, sentiments, sentiment_analyzer,
                                                 consumer_sleep_time)
        trends_consumer.append(cr)
        sentiment_analyzers.append(sentiment_analyzer)
        cr.start()
    except Exception as ex:
        log.error("failed to start consumer threads." + str(ex))

aggregator = Aggregator(paths[0], paths[1])

# Elastic Beanstalk application setup
# EB looks for an 'application' callable by default
Example #21
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test Naive Bayes classifier on 10000 tweets, tokenized using
    TweetTokenizer.
    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from sentiment_analyzer import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances/2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs+train_neg_docs
    testing_tweets = test_pos_docs+test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats([tweet[0] for tweet in training_tweets],
        top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='labeled_tweets', Classifier=type(classifier).__name__,
                        Tokenizer=tokenizer.__class__.__name__, Feats=extr,
                        Results=results, Instances=n_instances)
Example #22
from sentiment_analyzer import SentimentAnalyzer
import warnings

if __name__ == '__main__':
    #warnings.simplefilter(action='ignore', category=FutureWarning)
    test_list = [
        "This is a test example, which is very happy and joyous and I am glad that this works",
        "This is a second test case example that is sad and in fact, sucks."
    ]
    ob = SentimentAnalyzer()
    output_sentences = ob.get_string(test_list)
    for i in range(len(test_list)):
        print(test_list[i])
        print(output_sentences[i])
        print("\n\n\n")
Example #23
# Trainer for the El Nino Tracker
# This file is used to train data for the El Nino tracker application
from sentiment_analyzer import SentimentAnalyzer
from training_data import positive_tweets,negative_tweets

analyzer = SentimentAnalyzer()
analyzer.set_data(positive_tweets,negative_tweets)
print "data set"
print "training data..."
analyzer.train_data()
print "Finished training data. Training data is saved in sp_classifier.pickle file"

Example #24
from sentiment_analyzer import SentimentAnalyzer
import json
from flask import Flask
from flask import render_template
from flask import request

app = Flask(__name__)
app.config.from_pyfile('config.py')

movies_list = []
sa = SentimentAnalyzer()


@app.route('/')
def movies():
    with open('data/movies_list.json') as f:
        data = json.load(f)
        movies_list = data["movies"]
    return render_template('index.html', movies=movies_list)


@app.route('/send/<movie_id>', methods=['POST'])
def send(movie_id):
    feedback = request.form['feedback']
    res = sa.predict(feedback)

    return json.dumps({'status': 'OK', 'res': str(res[0])})


if __name__ == '__main__':
    train_model = app.config['TRAIN']
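
For context, a hedged client-side sketch of calling the /send/<movie_id> route above; the host, movie id, and feedback text are made up, and the requests library is assumed to be available.

import requests

resp = requests.post('http://localhost:5000/send/42',
                     data={'feedback': 'Loved every minute of it!'})
print(resp.json())  # e.g. {'status': 'OK', 'res': '<predicted label>'}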
Example #25
 def __init__(self):
     self.sentiment = SentimentAnalyzer()
     self.clf = self.sentiment.clf
Example #26
if __name__ == "__main__":

    do_pickle = False
    do_train_data = False
    do_fetch_data = False
    do_preprocess_data = False
    do_cross_validation_strategy = False
    do_holdout_strategy = False
    do_analyze_visualize = False

    # Create 'pickled' and 'plots' directories if not exists
    Path('./pickled').mkdir(exist_ok=True)
    Path('./plots').mkdir(exist_ok=True)

    if do_fetch_data or do_preprocess_data or do_cross_validation_strategy or do_holdout_strategy or do_analyze_visualize:
        sentiment = SentimentAnalyzer()

    if do_fetch_data:
        sentiment.getInitialData('datasets/product_reviews.json', do_pickle)

    if do_preprocess_data:
        reviews_df = pd.read_pickle('pickled/product_reviews.pickle')
        sentiment.preprocessData(reviews_df, do_pickle)

    if do_cross_validation_strategy or do_holdout_strategy:
        reviews_df_preprocessed = pd.read_pickle(
            'pickled/product_reviews_preprocessed.pickle')
        print(reviews_df_preprocessed.isnull().values.sum())  # Check for any null values

    if do_cross_validation_strategy:
Example #27
    except ImportError:
        pass

print('async_mode is ' + async_mode)

import eventlet
eventlet.monkey_patch()
app = Flask(__name__)
app.config['SECRET_KEY'] = 'secret!'
socketio = SocketIO(app, async_mode=async_mode)
thread = None

cursor = db.cursor()
cursor.execute("USE sp_data")

analyzer = SentimentAnalyzer()
analyzer.set_data(positive_tweets,negative_tweets)
#analyzer.train_data()
analyzer.get_training_data()
	
tweet_query = "SELECT text, ST_X(coordinates) AS lat, ST_Y(coordinates) AS lon, created_at, country_code, lang FROM test_cases"

try:
	cursor.execute(tweet_query)
	test_tweets = cursor.fetchall()
	print "Executing SQL statement"
except:
	print "Error: cannot fetch data!"
	
def do_analysis():
	tweet = { "text": "", "lat": 0, "lon": 0, "created_at": "", "country_code": 0, "lang": "" }
Example #28
 def test_init_positive01(self):
     sent = SentimentAnalyzer(feature_extractor=self.feature_extractor,
                              classifier=self.classifier)
     del sent
Example #29
from sentiment_analyzer import SentimentAnalyzer
from symbol_scanner import SymbolScanner
from twitter_scanner import TwitterScanner
from notifier import Notifier
import time

symbols = SymbolScanner().company_list

for symbol in symbols:
    time.sleep(5)
    twitter = TwitterScanner(symbol)
    tweets = twitter.tweets
    sentiment = SentimentAnalyzer(tweets)
    print "Symbol: " + symbol + " Avg Sentiment: " + str(
        sentiment.average_sentiment)
    #Notifier("YOUR_TWITTER_USERNAME", symbol, sentiment.average_sentiment)
    sentiment.reset()
    twitter.reset()