# Python 2 code; these imports are needed by the helpers below.
import re
import string

import twokenize


def pre_process(tweet):
    # 1. Normalize utf-8 formatting, then drop non-ASCII characters.
    tweet = tweet.decode("unicode-escape").encode("utf8").decode("utf8")
    tweet = tweet.encode("ascii", "ignore")
    tweet = tweet.strip(' \t\n\r')
    # 2. Lowercase.
    tweet = tweet.lower()
    # Word level: collapse runs of spaces into a single space.
    tweet = re.sub(' +', ' ', tweet)
    # 3. Normalize digits: "2015" -> "DDDD".
    tweet_words = tweet.strip('\r').split(' ')
    for word in [word for word in tweet_words if word.isdigit()]:
        tweet = tweet.replace(word, "D" * len(word))
    # 4. Normalize URL-ish tokens (parenthesized so the length check
    # applies whether the token contains '/' or '.').
    tweet_words = tweet.strip('\r').split(' ')
    for word in [word for word in tweet_words
                 if ('/' in word or '.' in word) and len(word) > 3]:
        tweet = tweet.replace(word, "httpAddress")
    # 5. Normalize @usernames (startswith() is safe on empty tokens).
    tweet_words = tweet.strip('\r').split(' ')
    for word in [word for word in tweet_words
                 if word.startswith('@') and len(word) > 1]:
        tweet = tweet.replace(word, "usrId")
    # 6. Replace special characters with spaces.
    punc = '@$%^&*()_+-={}[]:"|\'\~`<>/,'
    trans = string.maketrans(punc, ' ' * len(punc))
    tweet = tweet.translate(trans)
    # 7. Squeeze characters elongated 3+ times down to 2 ("soooo" -> "soo").
    tweet = re.sub(r"(.)\1\1+", r'\1\1', tweet.decode('utf-8'))
    # 8. Tokenize using tweetNLP (twokenize).
    tweet = ' '.join(twokenize.simpleTokenize(tweet))
    # 9. Replace stray newline characters with spaces.
    tweet = tweet.replace('\n', ' ')
    return tweet
def process(lst):
    # The per-tweet pipeline is identical to pre_process above, so this
    # simply delegates to it and collects the stripped results.
    prccd_item_list = []
    for tweet in lst:
        prccd_item_list.append(pre_process(tweet).strip())
    return prccd_item_list
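# A minimal usage sketch for the two preprocessing helpers above (Python 2).
# The sample tweet is illustrative; the exact output depends on twokenize's
# tokenization rules, so no precise result is asserted here.
if __name__ == "__main__":
    sample = "@bob check https://t.co/abc 12345 soooooo coooool\n"
    print pre_process(sample)       # mentions -> usrId, URLs -> httpAddress, digits -> D...
    print process([sample, "hello world"])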
def split(filename):
    # Naive CSV split: assumes "user,tweet" rows with no commas inside
    # the tweet text itself.
    tweets = FileFunc.read_file_into_list_unicode(filename)
    data = []
    for tweet in tweets:
        tmp = tweet.split(',')
        data.append({'user': tmp[0], 'tweet': simpleTokenize(tmp[1])})
    return data
def normalize_text(text):
    # remove_emojis and MENTION are a helper and a constant defined
    # elsewhere in this module.
    no_emojs = remove_emojis(text)
    if len(no_emojs) == 0:
        return []
    toks = simpleTokenize(no_emojs)
    norm = []
    for t in toks:
        t = t.replace('\n', '')
        if t.startswith('@'):
            # Collapse all @mentions into a single placeholder token.
            norm.append(MENTION)
        else:
            norm.append(t)
    return norm
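# A hedged sanity check for normalize_text. MENTION and remove_emojis are
# assumed to exist in the surrounding module; the stand-ins below exist only
# to make this snippet self-contained and do not reflect the real helpers.
MENTION = "<mention>"

def remove_emojis(s):
    # crude stand-in: keep ASCII only (the real helper is likely smarter)
    return ''.join(c for c in s if ord(c) < 128)

print(normalize_text("@alice thanks so much!"))
# expected shape: ['<mention>', 'thanks', 'so', 'much', '!']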
def transform(self, X):
    """
    `X` is expected to be a list of `Datapoint` instances.

    Return value is a list of `str` instances in which words were
    tokenized and are separated by a single space " ". Optionally,
    words are also lowercased depending on the argument given at
    __init__.
    """
    it = (" ".join(simpleTokenize(datapoint.content)) +
          ' @' + datapoint.name +
          ' #' + self.getMin(datapoint.date.split()[1])
          for datapoint in X)
    if self.lowercase:
        return [x.lower() for x in it]
    return list(it)
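# A hedged sketch of driving the transform above. The Datapoint namedtuple
# and sample values are illustrative stand-ins (the real Datapoint class
# lives elsewhere in the project), and the transformer's class name is not
# shown in this excerpt, so the call is left commented out.
from collections import namedtuple

Datapoint = namedtuple("Datapoint", ["content", "name", "date"])
dp = Datapoint(content="Good game!!", name="alice", date="2015-06-01 18:42:00")
# extractor = TextExtractor(lowercase=True)   # hypothetical class name
# print extractor.transform([dp])
# -> ['good game ! ! @alice #<minute bucket from getMin>']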
def classify(self, tweets):
    """
    Classify tweets.

    Tweets are given as input and returned with a polarity indicating
    positive or negative sentiment.
    """
    if not self.is_trained():
        # Inform the user that training is needed and can take some time.
        DLOG("Training is needed. Loading classifier from disk.")
        self.load_classifier()
        # Retry now that the classifier is available, and return the result.
        return self.classify(tweets)

    tokens = [simpleTokenize(tweet.text) for tweet in tweets]
    filtered_tokens = [filter_tokens(token_set) for token_set in tokens]

    extraction_start = datetime.datetime.now()  # request timing
    DLOG("Start feature extraction")
    if self.use_movie_reviews:
        feature_sets = [self.feature_extraction_movie_reviews(token_set)
                        for token_set in filtered_tokens]
    else:
        feature_sets = [self.feature_extraction(token_set)
                        for token_set in filtered_tokens]
    DLOG("Feature extraction time: " + str(datetime.datetime.now() - extraction_start))

    classification_start = datetime.datetime.now()  # request timing
    DLOG("Start classification")
    prob_dists = [self.classifier.prob_classify(feature_set)
                  for feature_set in feature_sets]
    DLOG("Classification time: " + str(datetime.datetime.now() - classification_start))

    # Pair each tweet with its probability distribution positionally.
    for tweet, prob_dist in zip(tweets, prob_dists):
        tweet.set_polarity(prob_dist)
    return tweets
def classify_tweets(classifier, tweets, word_features):
    tokens = [simpleTokenize(tweet.text) for tweet in tweets]
    filtered_tokens = [filter_tokens(token_set) for token_set in tokens]
    feature_sets = [tweet_features(token_set, word_features)
                    for token_set in filtered_tokens]
    return_dist = []
    for pdist in classifier.prob_classify_many(feature_sets):
        # Print the probability of each of the three labels.
        print('%.3f, %.3f, %.3f ' % (pdist.prob(classifier.labels()[0]),
                                     pdist.prob(classifier.labels()[1]),
                                     pdist.prob(classifier.labels()[2])))
        return_dist.append(pdist)
    return return_dist
def allbags(tweets):
    # One bag (token list) per tweet.
    return [twokenize.simpleTokenize(tweet) for tweet in tweets]
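# Illustrative call; the exact tokens depend on twokenize's rules, so the
# output shown is indicative rather than exact.
bags = allbags(["so cool :)", "@bob hi"])
# -> [['so', 'cool', ':)'], ['@bob', 'hi']]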
def train(self, arg=None):
    """
    Trains the classifier with saved train data.

    Sentiment features and the classifier are stored in class variables
    and also dumped with cPickle.
    """
    pos_tweets = read_tweets_from_file("traindata/postweets.txt")
    neg_tweets = read_tweets_from_file("traindata/negtweets.txt")
    objective_tweets1 = read_tweets_from_file("traindata/objectivetweets.txt")
    objective_tweets2 = read_tweets_from_file("traindata/objectivetweets2.txt")
    objective_tweets3 = read_tweets_from_file("traindata/objectivetweets3.txt")
    objective_tweets = objective_tweets1 + objective_tweets2 + objective_tweets3
    random.shuffle(objective_tweets)

    # Cap the number of training tweets per class to a sane range.
    training_tweet_number = 3000
    if arg is not None and 0 < arg < 10000:
        training_tweet_number = arg

    pos_tokens = [simpleTokenize(tweet) for tweet in pos_tweets[:training_tweet_number]]
    neg_tokens = [simpleTokenize(tweet) for tweet in neg_tweets[:training_tweet_number]]
    objective_tokens = [simpleTokenize(tweet) for tweet in objective_tweets[:training_tweet_number]]

    pos_filtered_tokens = [filter_tokens(tokens) for tokens in pos_tokens]
    neg_filtered_tokens = [filter_tokens(tokens) for tokens in neg_tokens]
    objective_filtered_tokens = [filter_tokens(tokens) for tokens in objective_tokens]

    pos_tweet_tokens = [dict(tokens=tokens, polarity="positive") for tokens in pos_filtered_tokens]
    neg_tweet_tokens = [dict(tokens=tokens, polarity="negative") for tokens in neg_filtered_tokens]
    objective_tweet_tokens = [dict(tokens=tokens, polarity="objective") for tokens in objective_filtered_tokens]
    all_tokens = pos_tweet_tokens + neg_tweet_tokens + objective_tweet_tokens

    all_words = FreqDist(t.lower() for d in all_tokens for t in d["tokens"])
    self.sentiment_features = all_words.keys()

    time_stamp = str(datetime.datetime.now())[:19]
    feature_file = "tweetfeatures/tweet_features_" + time_stamp + ".pkl"
    with open(feature_file, "wb") as fid:
        cPickle.dump(self.sentiment_features, fid)

    random.shuffle(all_tokens)
    # Feature extraction, then an 80/20 train/test split.
    featuresets = [(self.feature_extraction(d["tokens"]), d["polarity"]) for d in all_tokens]
    feature_length = len(featuresets)
    train_set = featuresets[:int(feature_length * 0.8)]
    test_set = featuresets[int(feature_length * 0.8):]

    self.classifier = NaiveBayesClassifier.train(train_set)
    DLOG(accuracy(self.classifier, test_set))
    self.classifier.show_most_informative_features()

    classifier_file = "classifier/classifier_" + time_stamp + ".pkl"
    with open(classifier_file, "wb") as fid:
        cPickle.dump(self.classifier, fid)
def train():
    if OFFLINE:
        tweets = twitter.get_offline_tweets()
        test_tweets = twitter.get_offline_test_tweets()
    else:
        tweets = twitter.get_training_tweets()
        test_tweets = twitter.get_test_tweets()

    # Pick the tokenize function once so the labelling code is written
    # only once instead of being duplicated per tokenizer branch.
    if TOKENIZER == "HAPPYFUN":
        tokenize = happyfuntokenizing.Tokenizer(preserve_case=False).tokenize
    else:
        tokenize = simpleTokenize

    pos_tweet_tokens = [dict(tokens=tokenize(tweet.text), polarity="positive") for tweet in tweets["pos"]]
    neg_tweet_tokens = [dict(tokens=tokenize(tweet.text), polarity="negative") for tweet in tweets["neg"]]
    tweets = pos_tweet_tokens + neg_tweet_tokens

    pos_test_tweet_tokens = [dict(tokens=tokenize(tweet.text), polarity="positive") for tweet in test_tweets["pos"]]
    neg_test_tweet_tokens = [dict(tokens=tokenize(tweet.text), polarity="negative") for tweet in test_tweets["neg"]]
    test_tweets = pos_test_tweet_tokens + neg_test_tweet_tokens

    all_words = nltk.FreqDist(t.lower() for d in tweets for t in d["tokens"])
    word_features = all_words.keys()

    random.shuffle(tweets)
    random.shuffle(test_tweets)

    train_set = [(tweet_features(d["tokens"], word_features), d["polarity"]) for d in tweets]
    test_set = [(tweet_features(d["tokens"], word_features), d["polarity"]) for d in test_tweets]

    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print nltk.classify.accuracy(classifier, test_set)
    classifier.show_most_informative_features()
    return classifier, word_features
def training():
    pos_tweets = read_tweets_from_file("postweets.txt")
    neg_tweets = read_tweets_from_file("negtweets.txt")
    objective_tweets1 = read_tweets_from_file("objectivetweets.txt")
    objective_tweets2 = read_tweets_from_file("objectivetweets2.txt")
    objective_tweets3 = read_tweets_from_file("objectivetweets3.txt")
    objective_tweets = objective_tweets1 + objective_tweets2 + objective_tweets3
    random.shuffle(objective_tweets)

    pos_tokens = [simpleTokenize(tweet) for tweet in pos_tweets[:3000]]
    neg_tokens = [simpleTokenize(tweet) for tweet in neg_tweets[:3000]]
    objective_tokens = [simpleTokenize(tweet) for tweet in objective_tweets[:3000]]

    pos_filtered_tokens = [filter_tokens(tokens) for tokens in pos_tokens]
    neg_filtered_tokens = [filter_tokens(tokens) for tokens in neg_tokens]
    objective_filtered_tokens = [filter_tokens(tokens) for tokens in objective_tokens]

    pos_tweet_tokens = [dict(tokens=tokens, polarity="positive") for tokens in pos_filtered_tokens]
    neg_tweet_tokens = [dict(tokens=tokens, polarity="negative") for tokens in neg_filtered_tokens]
    objective_tweet_tokens = [dict(tokens=tokens, polarity="objective") for tokens in objective_filtered_tokens]
    all_tokens = pos_tweet_tokens + neg_tweet_tokens + objective_tweet_tokens

    all_words = nltk.FreqDist(t.lower() for d in all_tokens for t in d["tokens"])
    word_features = all_words.keys()

    # Persist the feature vocabulary with a timestamped filename.
    time_stamp = str(datetime.datetime.now())[:19]
    feature_file = "tweet_features_" + time_stamp + ".pkl"
    with open(feature_file, "wb") as fid:
        cPickle.dump(word_features, fid)

    random.shuffle(all_tokens)
    # Feature extraction, then an 80/20 train/test split.
    featuresets = [(tweet_features(d["tokens"], word_features), d["polarity"]) for d in all_tokens]
    feature_length = len(featuresets)
    train_set = featuresets[:int(feature_length * 0.8)]
    test_set = featuresets[int(feature_length * 0.8):]

    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print nltk.classify.accuracy(classifier, test_set)
    classifier.show_most_informative_features()

    classifier_file = "classifier_" + time_stamp + ".pkl"
    with open(classifier_file, "wb") as fid:
        cPickle.dump(classifier, fid)
    return classifier, word_features
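# A hedged driver for the training/classification pair above. FakeTweet is a
# stand-in for whatever tweet object the project uses; classify_tweets only
# needs a .text attribute.
if __name__ == "__main__":
    classifier, word_features = training()

    class FakeTweet(object):
        def __init__(self, text):
            self.text = text

    dists = classify_tweets(classifier, [FakeTweet("I love this!")], word_features)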
import happyfuntokenizing
from twokenize import simpleTokenize

# Compare the ARK twokenize tokenizer against HappyFunTokenizing on the
# first few positive tweets. `tweets` is assumed to have been loaded
# earlier in this script.
print "token test"
tokenizer = happyfuntokenizing.Tokenizer(preserve_case=False)
for x in xrange(1, 10):
    print "---- ARK ----------"
    print simpleTokenize(tweets["pos"][x].text)
    print "---- HappyFun -----"
    print tokenizer.tokenize(tweets["pos"][x].text)