import json

SPACER = "###" * 10  # visual divider; defined at module level in the original


def corpusreader_demo():
    """
    Use :class:`TwitterCorpusReader` to read a file of tweets, and print out

    * some full tweets in JSON format;
    * some raw strings from the tweets (i.e., the value of the `text` field); and
    * the result of tokenising the raw strings.
    """
    from nltk.corpus import twitter_samples as tweets

    print()
    print("Complete tweet documents")
    print(SPACER)
    for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
    for text in tweets.strings("tweets.20150430-223406.json")[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
    for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
        print(toks)
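# Usage sketch (not part of the original snippet): the twitter_samples corpus
# must be downloaded once before this demo, or any snippet below, will run.
import nltk

nltk.download("twitter_samples")  # no-op if the corpus is already present
corpusreader_demo()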
import pickle
import re

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews, twitter_samples
from nltk.tokenize import TweetTokenizer


def trainMovieTwitter():
    # label every movie review by the folder it came from
    negLabeled = []
    posLabeled = []
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    for f in negids:
        negLabeled.append((word_feats(movie_reviews.words(fileids=[f])), 'neg'))
    for f in posids:
        posLabeled.append((word_feats(movie_reviews.words(fileids=[f])), 'pos'))

    # add the sample tweets, keeping only purely alphabetic tokens
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')
    for it in twitter_samples.docs('negative_tweets.json'):
        tokens = []
        for token in tknzr.tokenize(it['text']):
            if onlyWords.match(token) is not None:
                tokens.append(token.lower())
        negLabeled.append((word_feats(tokens), 'neg'))
    for it in twitter_samples.docs('positive_tweets.json'):
        tokens = []
        for token in tknzr.tokenize(it['text']):
            if onlyWords.match(token) is not None:
                tokens.append(token.lower())
        posLabeled.append((word_feats(tokens), 'pos'))

    # train on the combined corpus and persist the model
    train = negLabeled + posLabeled
    classifier = NaiveBayesClassifier.train(train)
    with open('movieTwitter_semtiment_classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
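# `word_feats` is called above but never defined in this snippet. A minimal
# sketch, assuming the usual bag-of-words feature dict from the NLTK
# movie-review classification example:
def word_feats(words):
    # mark every observed word as a present feature
    return dict((word, True) for word in words)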
import json
import re

from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer


def buildWordFeatures():
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')
    labeledTweets = []
    # keep only purely alphabetic tokens, lowercased
    for it in twitter_samples.docs('negative_tweets.json'):
        tokens = []
        for token in tknzr.tokenize(it['text']):
            if onlyWords.match(token) is not None:
                tokens.append(token.lower())
        labeledTweets.append((tokens, "negative"))
    for it in twitter_samples.docs('positive_tweets.json'):
        tokens = []
        for token in tknzr.tokenize(it['text']):
            if onlyWords.match(token) is not None:
                tokens.append(token.lower())
        labeledTweets.append((tokens, "positive"))

    wordFeatures = get_word_features(get_words_in_tweets(labeledTweets))
    with open('wordFeatures.json', 'w+') as fout:
        fout.write(json.dumps(wordFeatures, indent=2))
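# `get_words_in_tweets` and `get_word_features` are not defined in this
# snippet. Minimal sketches, assuming the versions from the well-known NLTK
# tweet-sentiment tutorial:
import nltk


def get_words_in_tweets(labeled_tweets):
    # flatten [(tokens, label), ...] into a single token list
    all_words = []
    for words, _label in labeled_tweets:
        all_words.extend(words)
    return all_words


def get_word_features(wordlist):
    # frequency-ordered vocabulary; the keys serve as the feature set
    freq = nltk.FreqDist(wordlist)
    return list(freq.keys())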
def count_words(words):
    wc = {}
    for word in words:
        wc[word] = wc.get(word, 0.0) + 1.0
    return wc


# set up some structures to store our data
vocab = {}
word_counts = {"pos": {}, "neg": {}}
priors = {"pos": 0., "neg": 0.}

from nltk.corpus import twitter_samples

print("Setting up text analyzer, please wait...")

# positive tweets: first 3000 train, remaining 2000 test
# (each sample file holds 5000 tweets)
tweets = twitter_samples.docs('positive_tweets.json')
all_tweets = []
test_tweets = []
for tweet in tweets[:3000]:
    all_tweets.append((tweet['text'], "pos"))
for tweet in tweets[3000:5000]:
    test_tweets.append((tweet['text'], "pos"))

# negative tweets: same split
tweets = twitter_samples.docs('negative_tweets.json')
for tweet in tweets[:3000]:
    all_tweets.append((tweet['text'], "neg"))
for tweet in tweets[3000:5000]:
    test_tweets.append((tweet['text'], "neg"))

# Build text model
for t in all_tweets:
    priors[t[1]] += 1
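# The excerpt above only tallies the class priors. A sketch of how
# count_words would plausibly fill vocab and word_counts (assumptions: a
# standard multinomial Naive Bayes tally, with plain whitespace tokenization,
# which the excerpt does not specify):
for text, label in all_tweets:
    counts = count_words(text.lower().split())
    for word, count in counts.items():
        vocab[word] = vocab.get(word, 0.0) + count
        word_counts[label][word] = word_counts[label].get(word, 0.0) + count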
@author: zhoujiagen
'''
# from nltk.twitter import Twitter
# tw = Twitter()

from nltk.corpus import twitter_samples
from pprint import pprint

# corpus file IDs
fileids = twitter_samples.fileids()
print(fileids)
print()

# inspect the file contents
# field documentation: https://dev.twitter.com/overview/api/tweets
docs = twitter_samples.docs('tweets.20150430-223406.json')
for doc in docs[:1]:
    pprint(doc)
print()
print()

# inspect the `text` field of each tweet
strings = twitter_samples.strings('tweets.20150430-223406.json')
for string in strings[:10]:
    print(string)
print()
import csv
import json
import os
import pickle
import re

from nltk.classify import NaiveBayesClassifier
from nltk.classify import util as classUtil
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer


def twitterClass():
    global wordFeatures
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')

    if not os.path.exists(os.path.join(os.getcwd(), 'semtiment_classifier.pickle')):
        print(twitter_samples.fileids())
        # build the labeled training set from the sample tweets, keeping
        # only purely alphabetic tokens
        labeledTweets = []
        for it in twitter_samples.docs('negative_tweets.json'):
            tokens = [token.lower() for token in tknzr.tokenize(it['text'])
                      if onlyWords.match(token) is not None]
            labeledTweets.append((tokens, "negative"))
        for it in twitter_samples.docs('positive_tweets.json'):
            tokens = [token.lower() for token in tknzr.tokenize(it['text'])
                      if onlyWords.match(token) is not None]
            labeledTweets.append((tokens, "positive"))

        wordFeatures = get_word_features(get_words_in_tweets(labeledTweets))
        print("training")
        training = classUtil.apply_features(extract_features, labeledTweets)
        # bind the trained model to `classifier` so it is usable below
        classifier = NaiveBayesClassifier.train(training)
        print("done training")
        with open('semtiment_classifier.pickle', 'wb') as f:
            pickle.dump(classifier, f)
    else:
        with open('wordFeatures.json', 'r') as fin:
            wordFeatures = json.load(fin)
        print(wordFeatures)
        with open('semtiment_classifier.pickle', 'rb') as f:
            classifier = pickle.load(f)  # type: NaiveBayesClassifier

    def classifyCsv(csvPath, outPath):
        # run each tweet in the CSV through the classifier and dump the
        # labeled records as JSON
        results = []
        with open(csvPath) as csvfile:
            for row in csv.DictReader(csvfile):
                text = row['text']
                features = [token.lower() for token in tknzr.tokenize(text)
                            if onlyWords.match(token) is not None]
                print(row['created_at'])
                results.append({
                    "created_at": row['created_at'],
                    "text": text,
                    "classification": classifier.classify(extract_features(features)),
                })
        with open(outPath, 'w+') as out:
            out.write(json.dumps(results, indent=2))

    # text,created_at
    classifyCsv('datafiles/trump.csv', 'trumpClassified.json')
    classifyCsv('datafiles/clinton.csv', 'clintonClassified.json')
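# `extract_features` is used above but not defined in this snippet. A sketch,
# assuming the classic tweet-sentiment tutorial version built on the global
# wordFeatures vocabulary:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in wordFeatures:
        # one boolean feature per vocabulary word
        features['contains(%s)' % word] = (word in document_words)
    return features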
def posNegativeTweets():
    # dump every labeled sample tweet to stdout
    for it in twitter_samples.docs('negative_tweets.json'):
        print(it)
    for it in twitter_samples.docs('positive_tweets.json'):
        print(it)
#   return result   (tail of a helper function truncated in this excerpt)

# set up some structures to store our data
vocab = {}
word_counts = {"pos": {}, "neg": {}}
priors = {"pos": 0., "neg": 0.}

from nltk.corpus import twitter_samples

print("Setting up text analyzer, please wait...")

# positive tweets: last 3000 train, first 2000 test
tweets = twitter_samples.docs('positive_tweets.json')
all_tweets = []
test_tweets = []
for tweet in tweets[2000:5000]:
    all_tweets.append((tweet['text'], "pos"))
for tweet in tweets[:2000]:
    test_tweets.append((tweet['text'], "pos"))

# negative tweets: same split
tweets = twitter_samples.docs('negative_tweets.json')
for tweet in tweets[2000:5000]:
    all_tweets.append((tweet['text'], "neg"))
for tweet in tweets[:2000]:
    test_tweets.append((tweet['text'], "neg"))

# Build text model
for t in all_tweets:
    priors[t[1]] += 1
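# Neither excerpt reaches the prediction step. A sketch of how the finished
# counts would typically be scored (assumptions: vocab and word_counts have
# been filled as in the earlier excerpt, tokens come from whitespace
# splitting, and add-one smoothing is used):
import math


def classify(text):
    words = text.lower().split()
    total = priors["pos"] + priors["neg"]
    scores = {}
    for label in ("pos", "neg"):
        score = math.log(priors[label] / total)  # log prior
        denom = sum(word_counts[label].values()) + len(vocab)
        for word in words:
            # smoothed log likelihood of each token under this class
            score += math.log((word_counts[label].get(word, 0.0) + 1.0) / denom)
        scores[label] = score
    return max(scores, key=scores.get)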
# downloading the sample tweets:
import nltk

nltk.download('twitter_samples')

from nltk.corpus import twitter_samples

tweets = twitter_samples.docs()  # all three sample files
docs = [t['text'] for t in tweets]

# initial number of tweets
len(docs)

# removing duplicate tweet texts:
docs_new = []
for i in docs:
    if i not in docs_new:
        docs_new.append(i)
docs = docs_new

# number of tweets after removing duplicates:
len(docs)

# regex operations performed on the tweet strings
import re

docs_new = []
for tweet in docs:
    # replace sad emoticons such as ':(' or ':((' with the word 'sad'
    tweet = re.sub(r'\:{1}\(+', 'sad', tweet)
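# A quick check of the substitution above on a made-up string (the excerpt
# is cut off before any remaining rules):
print(re.sub(r'\:{1}\(+', 'sad', 'no more coffee :(('))  # -> 'no more coffee sad'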