def classification_preprocess_all_datasets():
    """
    Preprocesses all datasets to be ready for classification task.
    This will include stemming, word correction, lower-casing, hashtag removal, special char removal.
    """
    
    for dataset in utils.annotated_datasets:
        tweetlines = utils.get_dataset(dataset)
        tweets = []
        for line in tweetlines:
            if len(line) > 1:
                tweets.append(tweet.to_tweet(line))
        
#        tweets = lower_case(tweets)
        tweets = remove_hastags_and_users(tweets)
        tweets = count_emoticons(tweets)
        tweets = replace_links(tweets)
        tweets = remove_specialchars(tweets)
        tweets = correct_words(tweets)
        tweets = stem(tweets)
        tweets = tokenize(tweets)
        tweets = pos_tag(tweets)
        tweets = count_exclamations(tweets)

        analyzer = Analyzer(dataset, tweets)
        stats = analyzer.analyze()
        print stats
        # Store the preprocessed tweets as pickles, named after the dataset
        # (the slice strips the leading path prefix and the file extension).
        print "Storing pickles..."
        utils.store_pickles(tweets, dataset[24:-4])
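# For reference, minimal sketches of what two of the helpers used in the
# pipeline above might look like. The regexes, the "_sketch" names and the
# .text attribute access are assumptions for illustration, not the project's
# actual implementations.
import re

def replace_links_sketch(tweets):
    # Replace every URL in the tweet text with a generic "URL" token.
    for t in tweets:
        t.text = re.sub(r"https?://\S+", "URL", t.text)
    return tweets

def remove_specialchars_sketch(tweets):
    # Keep only word characters and whitespace.
    for t in tweets:
        t.text = re.sub(r"[^\w\s]", "", t.text, flags=re.UNICODE)
    return tweets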
def preprocess_temporal_dataset():
    """
    Preprocesses the temporal dataset and stores the tweets and their
    lexicon-derived sentiments as pickles.
    """
    tweetlines = utils.get_dataset(utils.complete_datasets[3])
    tweets = []
    for line in tweetlines:
        if len(line) > 1:
            tweets.append(tweet.to_tweet(line))
    tweets = preprocessing.preprocess_tweets(tweets)
    sentiments = lexicon.perform_google_sentiment_lexicon_lookup(tweets)
    # Use context managers so the pickle files are closed properly.
    with open("temporal_sentiments", "wb") as sentiment_file:
        pickle.dump(sentiments, sentiment_file)
    with open("temporal_tweets2", "wb") as tweet_file:
        pickle.dump(tweets, tweet_file)
def initial_preprocess_all_datasets():
    """
    Runs first preprocessing iteration on all datasets.
    This is the preprocessing routine performed initially on the datasets before annotation.
    This routine includes duplicate removal
    """
        
    for i in range(0,len(utils.datasets)):
        #Fetch from dataset
        tweets = []
        tweetlines = utils.get_dataset(utils.complete_datasets[i])
        for tweetline in tweetlines:
            tweets.append(tweet.to_tweet(tweetline))
            
        #Perform preprocessing
        tweets = remove_duplicates_and_retweets(tweets)

        #Store back to dataset
        tweetlines = []
        for t in tweets:
            tweetlines.append(t.to_tsv())
        utils.store_dataset(tweetlines, utils.datasets[i])
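# A minimal sketch of what remove_duplicates_and_retweets() might do; the real
# helper lives elsewhere in the project, and the .text attribute and the
# "RT " prefix check are assumptions.
def remove_duplicates_and_retweets_sketch(tweets):
    seen = set()
    unique_tweets = []
    for t in tweets:
        if t.text.startswith("RT ") or t.text in seen:
            continue
        seen.add(t.text)
        unique_tweets.append(t)
    return unique_tweets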
replacement_chars = {u"&": u"og",
                     u"6amp;": u"og",
                     u"+": u"og"}
        
if __name__ == '__main__':
    #Testing
#    tweets = [Tweet("13:37", "johnarne", "Jeg () haaater drittt!!!? :( #justinbieber"), Tweet("13:37", "johnarne", "Jeg eeelsker @erna_solberg http://www.erna.no :) #love #jernerna" )]
#    for tweet in tweets:
#        tweet.set_sentiment("negative")
#        print tweet
    
    tweetlines = utils.get_dataset("test_annotated_data/erna_dataset.tsv")
    tweets = []
    for line in tweetlines:
        if len(line)>1:
            tweets.append(tweet.to_tweet(line))
        
    
#    tweets = lower_case(tweets)
    tweets = remove_hastags_and_users(tweets)
    tweets = count_emoticons(tweets)
    tweets = replace_links(tweets)
    tweets = remove_specialchars(tweets)
    for t in tweets:
        print t
    tweets = correct_words(tweets)
    tweets = stem(tweets)
    tweets = tokenize(tweets)
    for t in tweets:
        print t.stat_str()
    tweets = pos_tag(tweets)
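# Rough sketches of what stem() and tokenize() might look like for Norwegian
# tweets, assuming NLTK is installed. These are illustrative stand-ins with
# "_sketch" names, not the project's own implementations.
def stem_sketch(tweets):
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("norwegian")
    for t in tweets:
        t.text = " ".join(stemmer.stem(word) for word in t.text.split())
    return tweets

def tokenize_sketch(tweets):
    # Whitespace tokenization; the real tokenize() may be more sophisticated.
    for t in tweets:
        t.tokens = t.text.split()
    return tweets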
def user_annotation():
    """
    Feed tweets to the console one at a time, and ask the user for a sentiment annotation.
    """
    dataset = utils.select_dataset()
    text_tweets = utils.get_dataset(dataset)
    tweets = []
    for text_tweet in text_tweets:
        tweets.append(tweet.to_tweet(text_tweet))
    username = raw_input("Name? ... ")
    
    print "\n--------------\n"
    print "Input: "
    print "\n1: Negative sentiment (Negative opinion). \n2: Neutral/objective sentiment (No opinion). \n3: Positive sentiment (Positive opinion). \n5: Delete the tweet from the dataset. \nx: Cancel sequence. 0: Go back to previous tweet. "
    print "\n--------------\n"
    
    annotated_to = 0
    i = 0
    while i < len(tweets):
#        tweets[i].text.encode('utf8')
#        text = tweets[i].text
#        tweets[i].text = text.decode('utf8')
        try:
            print "Tweet nr. : "+str(i+1)
            print str((i + 1.0) / len(tweets) * 100) + " % done"
            print tweets[i].__str__().decode('utf8')
        except UnicodeEncodeError:
            try:
                print "Tweet nr. : "+str(i+1)
                print str(tweets[i])
            except UnicodeEncodeError:
                print "Could not print tweet number "+str(i+1)+". Deleting tweet..."
                del tweets[i]
                continue
        
        userinput = raw_input("...")
        while not legal_input(userinput):
            userinput = raw_input("Unlawful input! Please re-introduce.")
        if userinput is '1':
            tweets[i].set_sentiment("negative")
        elif userinput is '2':
            tweets[i].set_sentiment("neutral")
        elif userinput is '3':
            tweets[i].set_sentiment("positive")
        elif userinput is '5':
            print "Deleting tweet..."
            tweets.remove(tweets[i])
            continue
        elif userinput is '0':
            i = i-1
            continue
        elif userinput is 'x':
            break
        i = i+1
        
        
    #TODO: need to encode to utf when getting from dataset?!?!
    #Store the sentiment in file!
    tweetlines = []
    for t in tweets[:i]:
        if t.get_sentiment() is None:
            continue
        tweetlines.append(t.to_tsv())
    dir = username+"_annotated_data"
    if not os.path.exists(dir):
        os.makedirs(dir)
    utils.store_dataset(tweetlines, dir+dataset[4:])
    
    print "Domo arigato!"