# NOTE: imports inferred from the calls below; the module names (utils, tweet,
# preprocessing, lexicon, analyzer) are assumptions based on usage in this file.
import os
import pickle

import utils
import tweet
import preprocessing
import lexicon
from analyzer import Analyzer


def classification_preprocess_all_datasets():
    """
    Preprocesses all annotated datasets so they are ready for the
    classification task. This includes stemming, word correction,
    lower-casing, hashtag removal and special-character removal.
    """
    for i in range(0, len(utils.annotated_datasets)):
        tweetlines = utils.get_dataset(utils.annotated_datasets[i])
        tweets = []
        for line in tweetlines:
            if len(line) > 1:
                tweets.append(tweet.to_tweet(line))

        # tweets = lower_case(tweets)
        tweets = remove_hastags_and_users(tweets)
        tweets = count_emoticons(tweets)
        tweets = replace_links(tweets)
        tweets = remove_specialchars(tweets)
        tweets = correct_words(tweets)
        tweets = stem(tweets)
        tweets = tokenize(tweets)
        tweets = pos_tag(tweets)
        tweets = count_exclamations(tweets)

        analyzer = Analyzer(utils.annotated_datasets[i], tweets)
        stats = analyzer.analyze()
        print stats

        # Store tweets in pickles...
        print "Storing pickles..."
        utils.store_pickles(
            tweets,
            utils.annotated_datasets[i][24:len(utils.annotated_datasets[i]) - 4])
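
# replace_links is one of the helpers used in the pipeline above and is defined
# elsewhere in the project. The function below is a hedged illustration only: a
# minimal sketch of link replacement, assuming tweets expose a .text attribute
# and that links are collapsed to a placeholder token "URL" (both assumptions,
# not the project's confirmed API).
import re

def replace_links_sketch(tweets):
    url_pattern = re.compile(r"https?://\S+|www\.\S+", re.UNICODE)
    for t in tweets:
        # Swap every URL for a single placeholder token so the classifier
        # does not treat each distinct link as a separate word.
        t.text = url_pattern.sub(u"URL", t.text)
    return tweets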
def preprocess_temporal_dataset():
    """
    Preprocesses the temporal dataset and pickles both the tweets and
    their lexicon-based sentiment scores.
    """
    tweetlines = utils.get_dataset(utils.complete_datasets[3])
    tweets = []
    for line in tweetlines:
        if len(line) > 1:
            tweets.append(tweet.to_tweet(line))
    tweets = preprocessing.preprocess_tweets(tweets)
    sentiments = lexicon.perform_google_sentiment_lexicon_lookup(tweets)
    with open("temporal_sentiments", "wb") as sentiment_file:
        pickle.dump(sentiments, sentiment_file)
    with open("temporal_tweets2", "wb") as tweet_file:
        pickle.dump(tweets, tweet_file)
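
# Usage sketch: the pickled artifacts written above can be reloaded later for
# the temporal analysis step. The file names match the dump calls in
# preprocess_temporal_dataset(); the function name itself is illustrative.
def load_temporal_pickles():
    with open("temporal_sentiments", "rb") as sentiment_file:
        sentiments = pickle.load(sentiment_file)
    with open("temporal_tweets2", "rb") as tweet_file:
        tweets = pickle.load(tweet_file)
    return sentiments, tweets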
def initial_preprocess_all_datasets():
    """
    Runs the first preprocessing iteration on all datasets. This is the
    preprocessing routine performed initially on the datasets before
    annotation. This routine includes duplicate removal.
    """
    for i in range(0, len(utils.datasets)):
        # Fetch tweets from the dataset
        tweets = []
        tweetlines = utils.get_dataset(utils.complete_datasets[i])
        for tweetline in tweetlines:
            tweets.append(tweet.to_tweet(tweetline))
        # Perform preprocessing
        tweets = remove_duplicates_and_retweets(tweets)
        # Store back to the dataset
        tweetlines = []
        for t in tweets:
            tweetlines.append(t.to_tsv())
        utils.store_dataset(tweetlines, utils.datasets[i])
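
# remove_duplicates_and_retweets is defined elsewhere in the project; the
# sketch below only illustrates one plausible implementation. It assumes each
# tweet exposes a .text attribute and that a leading "RT " marks a retweet --
# both assumptions, not the project's confirmed behaviour.
def remove_duplicates_and_retweets_sketch(tweets):
    seen = set()
    unique_tweets = []
    for t in tweets:
        if t.text.startswith(u"RT "):
            continue  # drop retweets
        if t.text in seen:
            continue  # drop exact duplicates
        seen.add(t.text)
        unique_tweets.append(t)
    return unique_tweets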
replacement_chars = {u"&": u"og", u"6amp;": u"og", u"+": u"og"} if __name__ == '__main__': #Testing # tweets = [Tweet("13:37", "johnarne", "Jeg () haaater drittt!!!? :( #justinbieber"), Tweet("13:37", "johnarne", "Jeg eeelsker @erna_solberg http://www.erna.no :) #love #jernerna" )] # for tweet in tweets: # tweet.set_sentiment("negative") # print tweet tweetlines = utils.get_dataset("test_annotated_data/erna_dataset.tsv") tweets = [] for line in tweetlines: if len(line)>1: tweets.append(tweet.to_tweet(line)) # tweets = lower_case(tweets) tweets = remove_hastags_and_users(tweets) tweets = count_emoticons(tweets) tweets = replace_links(tweets) tweets = remove_specialchars(tweets) for tweet in tweets: print tweet tweets = correct_words(tweets) tweets = stem(tweets) tweets = tokenize(tweets) for tweet in tweets: print tweet.stat_str() tweets = pos_tag(tweets)
def user_annotation():
    """
    Feed tweets to the console one at a time, and ask the user for a
    sentiment annotation.
    """
    dataset = utils.select_dataset()
    text_tweets = utils.get_dataset(dataset)
    tweets = []
    for text_tweet in text_tweets:
        tweets.append(tweet.to_tweet(text_tweet))
    username = raw_input("Name? ... ")
    print "\n--------------\n"
    print "Input: "
    print "\n1: Negative sentiment (Negative opinion). \n2: Neutral/objective sentiment (No opinion). \n3: Positive sentiment (Positive opinion). \n5: Delete the tweet from the dataset. \nx: Cancel sequence. 0: Go back to previous tweet. "
    print "\n--------------\n"
    i = 0
    while i < len(tweets):
        try:
            print "Tweet nr. : " + str(i + 1)
            print str((i + 1.0) / len(tweets) * 100) + " % done "
            print unicode(tweets[i].__str__().decode('utf8'))
        except UnicodeEncodeError:
            try:
                print "Tweet nr. : " + str(i + 1)
                print str(tweets[i])
            except UnicodeEncodeError:
                print "Could not print tweet number " + str(i + 1) + ". Deleting tweet..."
                del tweets[i]
                continue
        userinput = raw_input("...")
        while not legal_input(userinput):
            userinput = raw_input("Illegal input! Please try again. ")
        # Compare with == ; 'is' tests object identity and only works for
        # short strings by accident of CPython interning.
        if userinput == '1':
            tweets[i].set_sentiment("negative")
        elif userinput == '2':
            tweets[i].set_sentiment("neutral")
        elif userinput == '3':
            tweets[i].set_sentiment("positive")
        elif userinput == '5':
            print "Deleting tweet..."
            del tweets[i]
            continue
        elif userinput == '0':
            i = max(i - 1, 0)  # don't step before the first tweet
            continue
        elif userinput == 'x':
            break
        i = i + 1
    #TODO: need to encode to utf when getting from dataset?!?!
    # Store the sentiments to file
    tweetlines = []
    for t in tweets[:i]:
        if t.get_sentiment() is None:
            continue
        tweetlines.append(t.to_tsv())
    outdir = username + "_annotated_data"  # 'outdir' avoids shadowing the dir() builtin
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    utils.store_dataset(tweetlines, outdir + dataset[4:])
    print "Domo arigato!"
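
# legal_input is referenced in user_annotation() but defined elsewhere in the
# project. A minimal sketch, assuming the accepted keys are exactly the ones
# listed in the prompt above:
def legal_input_sketch(userinput):
    return userinput in ('1', '2', '3', '5', '0', 'x')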