Example no. 1
def corpusreader_demo():
    """
    Use :class:`TwitterCorpusReader` to read a file of tweets, and print out

    * some full tweets in JSON format;
    * some raw strings from the tweets (i.e., the value of the `text` field); and
    * the result of tokenising the raw strings.

    """
    from nltk.corpus import twitter_samples as tweets

    print()
    print("Complete tweet documents")
    print(SPACER)
    for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
    for text in tweets.strings("tweets.20150430-223406.json")[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
    for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
        print(toks)
Example no. 2
def corpusreader_demo():
    """
    Use `TwitterCorpusReader` to read a file of tweets, and print out

    * some full tweets in JSON format;
    * some raw strings from the tweets (i.e., the value of the `text` field); and
    * the result of tokenising the raw strings.

    """
    from nltk.corpus import twitter_samples as tweets

    print()
    print("Complete tweet documents")
    print(SPACER)
    for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
    for text in tweets.strings("tweets.20150430-223406.json")[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
    for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
        print(toks)
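
Both snippets above assume some module-level setup that is not shown: the standard-library json module, the twitter_samples corpus data, and a SPACER separator string. A minimal sketch of that setup (the exact value of SPACER is an assumption; any separator string works), followed by a call to the demo, is:

import json

import nltk

nltk.download('twitter_samples')  # fetch the bundled sample tweets if not already present

SPACER = '#' * 40  # separator line printed between output sections (assumed value)

corpusreader_demo()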
def trainMovieTwitter():
    negLabeled = []
    posLabeled = []
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    for f in negids:
        negLabeled.append((word_feats(movie_reviews.words(fileids=[f])), 'neg'))
    # negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') ]

    for f in posids:
        posLabeled.append((word_feats(movie_reviews.words(fileids=[f])), 'pos'))
    # posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') ]

    # # train = negfeats + posfeats
    #
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')
    labeledTweets = []

    for it in twitter_samples.docs('negative_tweets.json'):
        tokens = []
        for token in tknzr.tokenize(it['text']):
            if onlyWords.match(token) is not None:
                tokens.append(token.lower())
        negLabeled.append((word_feats(tokens), 'neg'))
        # print [token for token in tknzr.tokenize(it['text']) if onlyWords.match(token) is not None]

    for it in twitter_samples.docs('positive_tweets.json'):
        tokens = []
        for token in tknzr.tokenize(it['text']):
            if onlyWords.match(token) is not None:
                tokens.append(token.lower())
        posLabeled.append((word_feats(tokens), 'pos'))

    train = negLabeled + posLabeled

    classifier = NaiveBayesClassifier.train(train)

    f = open('movieTwitter_sentiment_classifier.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()
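
trainMovieTwitter() depends on several imports and on a word_feats helper that are not part of the snippet. A minimal sketch, assuming the usual bag-of-words feature mapping in which every token becomes a boolean feature, is:

import pickle
import re

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews, twitter_samples
from nltk.tokenize import TweetTokenizer


def word_feats(words):
    # bag-of-words features: each token maps to True (assumed implementation)
    return dict((word, True) for word in words)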
def buildWordFeatures():
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')
    labeledTweets = []
    for it in twitter_samples.docs('negative_tweets.json'):
        tokens = []
        for token in tknzr.tokenize(it['text']):
            if onlyWords.match(token) is not None:
                tokens.append(token.lower())
        labeledTweets.append((tokens, "negative"))
        # print [token for token in tknzr.tokenize(it['text']) if onlyWords.match(token) is not None]

    for it in twitter_samples.docs('positive_tweets.json'):
        tokens = []
        for token in tknzr.tokenize(it['text']):
            if onlyWords.match(token) is not None:
                tokens.append(token.lower())
        labeledTweets.append((tokens, "positive"))

    wordFeatures = get_word_features(get_words_in_tweets(labeledTweets))
    fout = open('wordFeatures.json', "w+")
    fout.write(json.dumps(wordFeatures, indent=2))
    fout.close()
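
buildWordFeatures() calls get_words_in_tweets and get_word_features, which the snippet does not define. A plausible sketch, assuming the common pattern of flattening the token lists and returning a frequency-ordered vocabulary (a plain list, so it stays JSON-serialisable), is:

import nltk


def get_words_in_tweets(labeled_tweets):
    # flatten the (tokens, label) pairs into a single token list (assumed implementation)
    all_words = []
    for (words, sentiment) in labeled_tweets:
        all_words.extend(words)
    return all_words


def get_word_features(wordlist):
    # distinct vocabulary ordered by frequency (assumed implementation)
    freq = nltk.FreqDist(wordlist)
    return [word for word, count in freq.most_common()]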
Example no. 5
def count_words(words):
    wc = {}
    for word in words:
        wc[word] = wc.get(word, 0.0) + 1.0
    return wc


# setup some structures to store our data
vocab = {}
word_counts = {"pos": {}, "neg": {}}
priors = {"pos": 0., "neg": 0.}

from nltk.corpus import twitter_samples

print "Setting up text analyzer, please wait..."
tweets = twitter_samples.docs(
    'positive_tweets.json')  #Positive tweets to train model
all_tweets = []
test_tweets = []
for tweet in tweets[0:2999]:
    all_tweets.append((tweet['text'], "pos"))
for tweet in tweets[3000:4999]:
    test_tweets.append((tweet['text'], "pos"))
tweets = twitter_samples.docs(
    'negative_tweets.json')  #Negative tweets to train model
for tweet in tweets[0:2999]:
    all_tweets.append((tweet['text'], "neg"))
for tweet in tweets[3000:4999]:
    test_tweets.append((tweet['text'], "neg"))

# Build text model
for t in all_tweets:
    priors[t[1]] += 1
Example no. 6
'''
@author: zhoujiagen
'''

# from nltk.twitter import Twitter
# tw = Twitter()

from nltk.corpus import twitter_samples
from pprint import pprint

# file IDs
fileids = twitter_samples.fileids()
print fileids

print

# inspect the file contents
# field descriptions: https://dev.twitter.com/overview/api/tweets
docs = twitter_samples.docs('tweets.20150430-223406.json')
for doc in docs[:1]:
    pprint(doc)
    print

print

# inspect the text field of each tweet in the file
strings = twitter_samples.strings('tweets.20150430-223406.json')
for string in strings[:10]:
    print(string)

print
def twitterClass():
    global wordFeatures
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')
    # print
    if not os.path.exists(os.path.join(os.getcwd(), 'sentiment_classifier.pickle')):
        print twitter_samples.fileids()
        # print movie_reviews.fileids()
        # print

        tknzr = TweetTokenizer(strip_handles=True)
        onlyWords = re.compile('^[a-zA-Z]+$')
        labeledTweets = []

        for it in twitter_samples.docs('negative_tweets.json'):
            tokens = []
            for token in tknzr.tokenize(it['text']):
                if onlyWords.match(token) is not None:
                    tokens.append(token.lower())
            labeledTweets.append((tokens, "negative"))
            # print [token for token in tknzr.tokenize(it['text']) if onlyWords.match(token) is not None]

        for it in twitter_samples.docs('positive_tweets.json'):
            tokens = []
            for token in tknzr.tokenize(it['text']):
                if onlyWords.match(token) is not None:
                    tokens.append(token.lower())
            labeledTweets.append((tokens, "positive"))

        # print  labeledTweets
        wordFeatures = get_word_features(get_words_in_tweets(labeledTweets))
        print "training"
        training = classUtil.apply_features(extract_features, labeledTweets)
        # print training

        sentimentClassifier = NaiveBayesClassifier.train(training)
        print "done training"
        f = open('sentiment_classifier.pickle', 'wb')
        pickle.dump(sentimentClassifier, f)
        f.close()
    else:
        fin = open('wordFeatures.json', "r")
        wordFeatures = json.load(fin)
        fin.close()
        print wordFeatures
        f = open('sentiment_classifier.pickle', 'rb')
        classifier = pickle.load(f)  # type: nltk.classify.naivebayes.NaiveBayesClassifier
        f.close()
        # text,created_at
        tweets = []

        onlyWords = re.compile('^[a-zA-Z]+$')
        labeledTweets = []
        for row in csv.DictReader(open('datafiles/trump.csv')):
            text = row['text']
            features = []
            for token in tknzr.tokenize(text):
                if onlyWords.match(token) is not None:
                    features.append(token.lower())
            print row['created_at']
            tweets.append({
                "created_at": row['created_at'],
                "text": text,
                "classification": classifier.classify(extract_features(features))
            })
        classification = open('trumpClassified.json', 'w+')
        classification.write(json.dumps(tweets, indent=2))
        classification.close()
        tweets = []
        labeledTweets = []
        for row in csv.DictReader(open('datafiles/clinton.csv')):
            text = row['text']
            features = []
            for token in tknzr.tokenize(text):
                if onlyWords.match(token) is not None:
                    features.append(token.lower())
            print row['created_at']
            tweets.append({
                "created_at": row['created_at'],
                "text": text,
                "classification": classifier.classify(extract_features(features))
            })
        classification = open('clintonClassified.json', 'w+')
        classification.write(json.dumps(tweets, indent=2))
        classification.close()
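
twitterClass() also relies on an extract_features function and on classUtil, neither of which is shown; classUtil presumably aliases nltk.classify.util, whose apply_features wraps a feature extractor over the labelled tweets. A minimal sketch of extract_features in the usual 'contains(word)' style, assuming wordFeatures is the global word list built above, is:

import nltk.classify.util as classUtil  # assumed alias for nltk.classify.util


def extract_features(document):
    # boolean 'contains(word)' features over the global wordFeatures list (assumed implementation)
    document_words = set(document)
    features = {}
    for word in wordFeatures:
        features['contains(%s)' % word] = (word in document_words)
    return features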
def posNegativeTweets():
    for it in twitter_samples.docs('negative_tweets.json'):
        print it

    for it in twitter_samples.docs('positive_tweets.json'):
        print it

# setup some structures to store our data
vocab = {}
word_counts = {
	"pos": {},
	"neg": {}
}
priors = {
	"pos": 0.,
	"neg": 0.
}

from nltk.corpus import twitter_samples
print "Setting up text analyzer, please wait..."
tweets = twitter_samples.docs('positive_tweets.json') #Positive tweets to train model
all_tweets = []
test_tweets = []
for tweet in tweets[2000:4999]:
	all_tweets.append((tweet['text'], "pos"))
for tweet in tweets[0:1999]:
	test_tweets.append((tweet['text'], "pos"))
tweets = twitter_samples.docs('negative_tweets.json') #Negative tweets to train model
for tweet in tweets[2000:4999]:
	all_tweets.append((tweet['text'], "neg"))
for tweet in tweets[0:1999]:
	test_tweets.append((tweet['text'], "neg"))

# Build text model
for t in all_tweets:
	priors[t[1]] += 1
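
Both copies of this model-building snippet stop partway through the training loop. A sketch of how the counting step typically continues, using the count_words helper defined earlier (the whitespace tokenisation and the loop body are assumptions, not taken from the source), is:

for t in all_tweets:
    priors[t[1]] += 1
    words = count_words(t[0].split())  # crude whitespace tokenisation (assumption)
    for w, c in words.items():
        vocab[w] = vocab.get(w, 0.0) + c                          # corpus-wide counts
        word_counts[t[1]][w] = word_counts[t[1]].get(w, 0.0) + c  # per-class counts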
Example no. 10
#downloading sample tweets:
import nltk

nltk.download('twitter_samples')
from nltk.corpus import twitter_samples

tweets = twitter_samples.docs()
docs = [t['text'] for t in tweets]

#initial number of tweets
len(docs)

# removing duplicate tweets:
docs_new = []
for i in docs:
    if i not in docs_new:
        docs_new.append(i)
docs = docs_new

#number of tweets after removing duplicates:
len(docs)

# regex operations performed on the tweet strings (emoticon normalisation)
import re

docs_new = []
for tweet in docs:
    tweet = re.sub(r'\:{1}\(+', 'sad', tweet)
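
For reference, the substitution above rewrites a sad-face emoticon (a colon followed by one or more '(' characters) as the token 'sad'; a quick check on an illustrative string (the sample text here is made up):

sample = 'missed the bus again :((('
print(re.sub(r'\:{1}\(+', 'sad', sample))  # -> 'missed the bus again sad'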