Code example #1
File: twitterapi.py Project: GuilhermeFerreira08/Git
 def get_tweet_JsonFiles(self, json_file2=None):
     if json_file2 is None:
         all_tweets_samples = twitter_samples.fileids()
         json_file = all_tweets_samples[2]  # json file
         tweet_string = twitter_samples.strings(json_file)
         return tweet_string
     tweet_string = json_file2
     return tweet_string
Code example #2
File: test_corpus.py Project: davidam/damenltk
 def test_corpus_twitter_method_returns_correct_result(self):
     self.assertEqual(twitter_samples.fileids(), [
         'negative_tweets.json', 'positive_tweets.json',
         'tweets.20150430-223406.json'
     ])
     self.assertEqual(
         twitter_samples.strings('negative_tweets.json')[0],
         'hopeless for tmr :(')
     self.assertEqual(
         twitter_samples.strings('positive_tweets.json')[0],
         '#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'
     )
Code example #3
 def validate(self, classifier):
     """Test the accuracy of a given classifier against a labelled test dataset.
     Args:
         classifier: (Bayesian, DecisionTree, SVC, LinearSVC) classifier used to classify the data
     Returns:
         None
     """
     tweets = twitter_samples.fileids()
     pos_tweets = twitter_samples.tokenized(tweets[1])
     neg_tweets = twitter_samples.tokenized(tweets[0])
     pos_testing = pos_tweets[(len(pos_tweets) * 7 // 8):]
     neg_testing = neg_tweets[(len(neg_tweets) * 7 // 8):]
     pos_test = [(self.train_feats(f), 'positive') for f in pos_testing]
     neg_test = [(self.train_feats(f), 'negative') for f in neg_testing]
     testfeats = pos_test + neg_test
     print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testfeats)) * 100)
Code example #4
 def parseTweets(self):
     """Parses tweets and extracts features from them.
     Args:
         None
     Returns:
         train_feats: list of (features, label) pairs extracted from the
             positive and negative training tweets.
     """
     tweets = twitter_samples.fileids()
     pos_tweets = twitter_samples.tokenized(tweets[1])
     neg_tweets = twitter_samples.tokenized(tweets[0])
     pos_training = pos_tweets[:(len(pos_tweets) * 7 // 8)]
     neg_training = neg_tweets[:(len(neg_tweets) * 7 // 8)]
     pos_feats = [(self.extract_feats(f), 'positive') for f in pos_training]
     neg_feats = [(self.extract_feats(f), 'negative') for f in neg_training]
     train_feats = pos_feats + neg_feats
     print('[-] Feature Extraction Finished')
     return train_feats
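# Hedged usage sketch (not part of the original project): parseTweets() and
# validate() appear to be methods of the same classifier-wrapper class, so a
# train-then-evaluate run might look roughly like this. `TweetSentimentAnalyzer`
# is a hypothetical name for that class.
import nltk

analyzer = TweetSentimentAnalyzer()                        # hypothetical wrapper class
train_feats = analyzer.parseTweets()                       # labelled training features
classifier = nltk.NaiveBayesClassifier.train(train_feats)
analyzer.validate(classifier)                              # prints accuracy on the held-out 1/8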
Code example #5
'''
This script imports twitter_samples from nltk.corpus, stopwords from nltk.corpus,
PorterStemmer from nltk.stem.porter, ngrams from nltk, and re. It then loads a
corpus of sample tweets about Brexit. The tweets are cleaned to remove special
characters, hashtags, and Twitter user IDs; the tweet text is then tokenized
and stemmed. Finally, we compute frequency distributions to try to determine
the most frequently used terms in the tweets (a sketch of this step follows the
clean_text function below).
'''

import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import ngrams
import re

twitter_samples.fileids()
tweets = twitter_samples.strings(twitter_samples.fileids()[-1])
porter_stemmer = PorterStemmer()

def clean_text(tweet):         
	tweet 	= tweet.strip()   
	pattern = "(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT)"      
	cleaned_tweet = ' '.join(re.sub(pattern," ",tweet).split())
	wordTokens = nltk.word_tokenize(cleaned_tweet)
	wordTokens = [token.lower() for token in wordTokens if len(token)>1]
	stops = set(stopwords.words("english"))
	wordTokens = [token for token in wordTokens if token not in stops]
	cleaned_tweet = ' '.join(wordTokens)
	return cleaned_tweet

# Remove special characters, stopwords, twitter IDs, and hashtags.
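# A minimal sketch (not in the original snippet) of the frequency-distribution
# step the docstring describes: clean each tweet with clean_text() above, stem
# the tokens, then count the most common unigrams and bigrams with nltk.FreqDist
# and nltk.ngrams.
cleaned_tweets = [clean_text(t) for t in tweets]
stemmed_tweets = [[porter_stemmer.stem(tok) for tok in t.split()] for t in cleaned_tweets]
unigram_freq = nltk.FreqDist(tok for toks in stemmed_tweets for tok in toks)
bigram_freq = nltk.FreqDist(gram for toks in stemmed_tweets for gram in ngrams(toks, 2))
print(unigram_freq.most_common(20))
print(bigram_freq.most_common(20))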
Code example #6
lm_classifier = lmModel.fit(training_features, training_labels)
predictions = lm_classifier.predict(test_features)
print("Precision of LinearRegression is")
precision = calculate_precision(predictions, test_gold_labels)
print("Test data\t" + str(precision))
#Real time testing
real_time_test(lm_classifier, vocabulary_mv)

###twitter sentiment analyzer
import re

import nltk
from nltk.corpus import twitter_samples as ts  # the `ts` alias is assumed from its use below
from nltk.stem.porter import PorterStemmer

nltk.download('words')

porter = PorterStemmer()
ts.fileids()
twitt_str = ts.strings('tweets.20150430-223406.json')
twitt_token = ts.tokenized('tweets.20150430-223406.json')
#print(twitt_str)
print(len(twitt_token))

data_words = nltk.word_tokenize(str(twitt_str))

data_words = [word.lower() for word in data_words if word.isalpha()]
stemmed = [porter.stem(data_word) for data_word in data_words]
print(stemmed)
print(len(stemmed))

wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
Code example #7
# -*- coding: utf-8 -*-
"""
Created on Sat Oct  1 17:09:48 2016

@author: Admin
"""
import nltk

from nltk.corpus import twitter_samples
twitter_samples.fileids()
"""Accessing json file of positive tweets"""
positive = nltk.corpus.twitter_samples.raw("positive_tweets.json")
positive.__str__()

postwts = nltk.word_tokenize(positive)
"""Length of all the positive tweets"""
len(set(postwts))

from nltk.corpus import twitter_samples
twitter_samples.fileids()
"""Accessing json file of negative tweets"""
tweet = twitter_samples.raw("negative_tweets.json")
tweet.__str__()

tokens = nltk.word_tokenize(tweet)
"""Length of all the negative tweets"""
len(set(tokens))
Code example #8
#! /usr/bin/env python

from nltk.corpus import twitter_samples
import json
import csv

print twitter_samples.fileids()

pos_output_file = "pos_tweets_list.txt"
neg_output_file = "neg_tweets_list.txt"


def clean_up_files(filename):
    data = list()
    with open(filename, 'r') as f:
        for line in f:
            if len(line) > 1:
                data.append(line)

    with open(filename, 'w') as f:
        for line in data:
            f.write(line)


pos_tweets_file = twitter_samples.abspath(twitter_samples.fileids()[1])
pos_tweets_output = open(pos_output_file, 'w+')
with open(pos_tweets_file, 'r') as tf:
    for line in tf:
        x = json.loads(line)
        tweet = x['text'].encode('UTF-8')
        if '\n' not in tweet:
Code example #9
def twitterClass():
    global wordFeatures
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')
    # print
    if not os.path.exists(os.path.join(os.getcwd(), 'semtiment_classifier.pickle')):
        print twitter_samples.fileids()
        # print movie_reviews.fileids()
        # print

        tknzr = TweetTokenizer(strip_handles=True)
        onlyWords = re.compile('^[a-zA-Z]+$')
        labeledTweets = []

        for it in twitter_samples.docs('negative_tweets.json'):
            tokens = []
            for token in tknzr.tokenize(it['text']):
                if onlyWords.match(token) is not None:
                    tokens.append(token.lower())
            labeledTweets.append((tokens, "negative"))
            # print [token for token in tknzr.tokenize(it['text']) if onlyWords.match(token) is not None]

        for it in twitter_samples.docs('positive_tweets.json'):
            tokens = []
            for token in tknzr.tokenize(it['text']):
                if onlyWords.match(token) is not None:
                    tokens.append(token.lower())
            labeledTweets.append((tokens, "positive"))

        # print  labeledTweets
        wordFeatures = get_word_features(get_words_in_tweets(labeledTweets))
        print "training"
        training = classUtil.apply_features(extract_features, labeledTweets)
        # print training

        sentimentClassifier = NaiveBayesClassifier.train(training)
        print "done training"
        f = open('semtiment_classifier.pickle', 'wb')
        pickle.dump(sentimentClassifier, f)
        f.close()
    else:
        fin = open('wordFeatures.json', "r")
        wordFeatures = json.load(fin)
        fin.close()
        print wordFeatures
        f = open('semtiment_classifier.pickle', 'rb')
        classifier = pickle.load(f)  # type: nltk.classify.naivebayes.NaiveBayesClassifier
        f.close()
        # text,created_at
        tweets = []

        onlyWords = re.compile('^[a-zA-Z]+$')
        labeledTweets = []
        for row in csv.DictReader(open('datafiles/trump.csv')):
            text = row['text']
            features = []
            for token in tknzr.tokenize(text):
                if onlyWords.match(token) is not None:
                    features.append(token.lower())
            print row['created_at']
            tweets.append({
                "created_at": row['created_at'],
                "text": text,
                "classification": classifier.classify(extract_features(features))
            })
        classification = open('trumpClassified.json', 'w+')
        classification.write(json.dumps(tweets, indent=2))
        classification.close()
        tweets = []
        labeledTweets = []
        for row in csv.DictReader(open('datafiles/clinton.csv')):
            text = row['text']
            features = []
            for token in tknzr.tokenize(text):
                if onlyWords.match(token) is not None:
                    features.append(token.lower())
            print row['created_at']
            tweets.append({
                "created_at": row['created_at'],
                "text": text,
                "classification": classifier.classify(extract_features(features))
            })
        classification = open('clintonClassified.json', 'w+')
        classification.write(json.dumps(tweets, indent=2))
        classification.close()
Code example #10
import nltk
from nltk.corpus import twitter_samples

nltk.download('twitter_samples')

print("Files: ", twitter_samples.fileids())

tweets = twitter_samples.strings('tweets.20150430-223406.json')
print("Total tweets: ", len(tweets))
for tweet in tweets[:10]:
    print(tweet)
Code example #11
File: twitter_cleaned.py Project: pbexe/news-graph
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import word_tokenize
import re
from collections import Counter
import plotly.plotly as py
import plotly.graph_objs as go


print(twitter_samples.fileids())
stop_words = set(stopwords.words('english'))
negTweets = twitter_samples.strings('negative_tweets.json')
posTweets = twitter_samples.strings('positive_tweets.json')
negWords = []
posWords = []
for tweet in posTweets:
	posWords.extend(w for w in word_tokenize(re.sub(r'RT |@\S*|#\S+|http\S+|\n-|w/|[\.]{2,}', '', tweet)) if w not in stop_words)
for tweet in negTweets:
	negWords.extend(w for w in word_tokenize(re.sub(r'RT |@\S*|#\S+|http\S+|\n-|w/|[\.]{2,}', '', tweet)) if w not in stop_words)

results = {}
posWords=Counter(posWords)
negWords=Counter(negWords)
for word in posWords:
	if word in negWords:
		results[word] = posWords[word] - negWords[word]
	else:
		results[word] = posWords[word]
for word in negWords:
	if not word in results:
		results[word] = 0 - negWords[word]
Code example #12
from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile
from nltk.corpus import twitter_samples
#retrieving twitter data and interfacing with API
oauth = credsfromfile()
#tw = Twitter()
#tw.tweets(to_screen=False, limit=25)

#sample public twitter stream
#client = Streamer(**oauth)
#client.register(TweetViewer(limit=10))
#client.sample()

#client = Query(**oauth)
#tweets = client.search_tweets(keywords='nltk', limit = 50)
#tweet = next(tweets)
#from pprint import pprint
#pprint(tweet, depth = 1)

print(twitter_samples.fileids())

strings = twitter_samples.strings('tweets.20150430-223406.json')
for string in strings[:15]:
    print(string)
Code example #13
 def __init__(self):
     self.number_id = 41
     self.source_id = "twitter_samples"
     self.titles = [name for name in twitter_samples.fileids()]
     self.data = [twitter_samples.raw(name) for name in self.titles]
Code example #14
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv

#corpus twitter_sample tweets ~20k
jsonfile = twitter_samples.fileids()[-1]

#absolute path for the file: #input_file = os.path.abspath(jsonfile)=>returns virtualenv file path
input_file = twitter_samples.abspath(jsonfile) #returns system /usr/share/ path

#with open(input_file) as fp:
	#json2csv(fp,'tweets_text.csv',['text']) #json2csv(pointer, nameoffile, [feature1,feature2,feature3])

#TODO: think about which attributes to import, load the CSV into a pandas DataFrame, apply stemming to the tweet texts, and save them (see the sketch after this snippet).
with open(input_file) as fp:
	json2csv(fp, 'tweets_dataframe.csv',['id','text','user.favourites_count','user.id','lang','user.followers_count','user.verified','truncated'])
#json, csv
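# Hedged follow-up sketch (not part of the original snippet) for the TODO above:
# load the exported CSV into a pandas DataFrame, stem the tweet texts with
# PorterStemmer, and save the result. Assumes pandas is installed and nltk's
# 'punkt' tokenizer data has been downloaded; 'tweets_dataframe_stemmed.csv' is
# just an illustrative output name.
import nltk
import pandas as pd
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
df = pd.read_csv('tweets_dataframe.csv')
df['stemmed_text'] = df['text'].apply(
    lambda t: ' '.join(stemmer.stem(w) for w in nltk.word_tokenize(str(t))))
df.to_csv('tweets_dataframe_stemmed.csv', index=False)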
Code example #15
from nltk.corpus import twitter_samples, TwitterCorpusReader
import numpy as np
import matplotlib.pyplot as plt
from NBClassifier import NBClassifier
from SCClassifier import SCClassifier
from BGClassifier import BGClassifier
from sklearn.metrics import roc_curve, auc
import os
import pickle

# settings
fileIds = twitter_samples.fileids()
root = twitter_samples.root

negReader = TwitterCorpusReader(root, fileIds[0])
negTwt = []
posReader = TwitterCorpusReader(root, fileIds[1])
posTwt = []
for tweet in negReader.docs():
    negTwt.append((tweet['text']))
for tweet in posReader.docs():
    posTwt.append((tweet['text']))

posInd = np.random.permutation(len(posTwt))
negInd = np.random.permutation(len(negTwt))

X_1 = np.array([])
X_2 = np.array([])
X_3 = np.array([])
Y = np.array([])
NB_auc = np.zeros((5, 1))
Code example #16
        tokenTweet = tknzr.tokenize(tweet)
        j = 0
        while j < len(tokenTweet):
            #print j
            if tokenTweet[j][0] == "#":
                tokenTweet[j] = tokenTweet[j][1:]
                j += 1
            elif tokenTweet[j][0] == "@":
                # drop the handle without advancing, so the next token is not skipped
                del tokenTweet[j]
            else:
                j += 1
            
        info.append((word_feats(tokenTweet), classification))

ids = twitter_samples.fileids()
neg = 0
pos = 1
negtweets = "negtweets.txt"
postweets = "postweets.txt"

#tags unused
neginfo = []
posinfo = []
negtags = []
postags = []
getTweetTokens('neg', ids[0], neginfo, negtags)
getTweetTokens('pos', ids[1], posinfo, postags)

##for i in range(2):
##    print str(posinfo[i])
Code example #17
'''
This is a sentiment analysis example built from NLTK's sample data and corpora.
The Twitter samples corpus will be used as the data.
'''

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
import random

# Collect data
print(twitter_samples.fileids())


def create_word_features(words):
    useful_words = [
        word for word in words if word not in stopwords.words('english')
    ]
    my_dict = dict([(word, True) for word in useful_words])
    return my_dict


## tweets collection
neg_strings = twitter_samples.strings('negative_tweets.json')
neg_tweets = []
for i, string in enumerate(neg_strings):
    # clean out smileys in strings
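    # (Hedged continuation sketch, not from the original snippet: one plausible
    # way to strip the sad-face emoticons and build labelled feature dicts.)
    string = string.replace(':(', '').replace(':-(', '').strip()
    neg_tweets.append((create_word_features(word_tokenize(string)), 'negative'))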
Code example #18
from collections import defaultdict

import nltk
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer

# First thing to do: open the NLTK downloader (after installing NLTK)
nltk.download()

# Secondly, download the twitter_samples corpus
nltk.download('twitter_samples')

# Finally, we download the stopwords
nltk.download('stopwords')

# The twitter_samples corpus contains 3 files: 5,000 positive tweets, 5,000 negative tweets, and 20,000 positive and negative tweets
# For this project, we will only use the 10,000-tweet subset of twitter_samples already
# available in the nltk.corpus module, i.e. the files of 5,000 positive and 5,000 negative tweets

print("Different type of tweet =>", twitter_samples.fileids())
pos_tweets = twitter_samples.strings('positive_tweets.json')
print("Len of POSITIVE tweet", len(pos_tweets))  #output : 5000
neg_tweets = twitter_samples.strings('negative_tweets.json')
print("Len of NEGATIVE tweet", len(neg_tweets))  #output : 5000
all_tweets = twitter_samples.strings('tweets.20150430-223406.json')
print("Length of TOTAL tweet from tweets.20150430-223406.json",
      len(all_tweets))  #output : 20000

# Use the TweetTokenizer module to tokenize (split the text into a list of tokens)
tweet_tokenizer = TweetTokenizer(preserve_case=False,
                                 strip_handles=True,
                                 reduce_len=True)

# Denoise the tweets by removing stock tickers like $GE, retweet markers ("RT"), hyperlinks, the '#' sign,
# stopwords (words like a, and, the, is, are, etc.), emoticons and punctuation, and then reduce each word
# to its stem/base form using the Porter stemming algorithm (a hedged sketch of this step follows below).
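# A hedged sketch (not part of the original snippet) of the denoising/stemming
# step described above: strip retweet markers, stock tickers, hyperlinks and the
# '#' sign, tokenize with the TweetTokenizer configured above, drop stopwords and
# punctuation, and reduce each remaining token to its stem with PorterStemmer.
import re
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def process_tweet(tweet):
    tweet = re.sub(r'^RT[\s]+', '', tweet)       # retweet marker
    tweet = re.sub(r'\$\w+', '', tweet)          # stock tickers like $GE
    tweet = re.sub(r'https?://\S+', '', tweet)   # hyperlinks
    tweet = re.sub(r'#', '', tweet)              # keep the word, drop the '#' sign
    tokens = tweet_tokenizer.tokenize(tweet)
    return [stemmer.stem(tok) for tok in tokens
            if tok not in stop_words and tok not in string.punctuation]

print(process_tweet(pos_tweets[0]))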