def initial_preprocess_all_datasets():
    """
    Runs the first preprocessing pass on all datasets.
    This routine is performed on the raw datasets before annotation and
    removes duplicates and retweets.
    """
        
    for i in range(len(utils.datasets)):
        #Fetch raw tweet lines from the complete dataset and parse them into tweet objects
        tweets = []
        tweetlines = utils.get_dataset(utils.complete_datasets[i])
        for tweetline in tweetlines:
            tweets.append(tweet.to_tweet(tweetline))
            
        #Perform preprocessing
        tweets = remove_duplicates_and_retweets(tweets)

        #Serialise the preprocessed tweets and store them in the working dataset
        tweetlines = []
        for t in tweets:
            tweetlines.append(t.to_tsv())
        utils.store_dataset(tweetlines, utils.datasets[i])
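

#remove_duplicates_and_retweets (used above) is assumed to be defined elsewhere in
#this module. A minimal illustrative sketch, assuming tweet objects expose a plain
#text attribute named 'text', could look like this (the name below is hypothetical):
def _example_remove_duplicates_and_retweets(tweets):
    """Illustrative sketch only: drop retweets and exact duplicate texts."""
    seen = set()
    unique = []
    for t in tweets:
        text = t.text.strip()
        if text.startswith("RT @"):   #naive retweet check
            continue
        if text in seen:              #exact-duplicate check
            continue
        seen.add(text)
        unique.append(t)
    return unique
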
def user_annotation():
    """
    Feeds tweets from the selected dataset to the console one at a time and asks
    the user for a sentiment annotation. Annotated tweets are stored to a per-user file.
    """
    #Load the selected dataset and parse each line into a tweet object
    dataset = utils.select_dataset()
    text_tweets = utils.get_dataset(dataset)
    tweets = []
    for text_tweet in text_tweets:
        tweets.append(tweet.to_tweet(text_tweet))
    username = raw_input("Name? ... ")
    
    print "\n--------------\n"
    print "Input: "
    print "\n1: Negative sentiment (Negative opinion). \n2: Neutral/objective sentiment (No opinion). \n3: Positive sentiment (Positive opinion). \n5: Delete the tweet from the dataset. \nx: Cancel sequence. 0: Go back to previous tweet. "
    print "\n--------------\n"
    
    i = 0
    while i < len(tweets):
        try:
            print "Tweet nr. : "+str(i+1)
            print str((i+1.0)/len(tweets)*100)+" % done "
            print str(tweets[i]).decode('utf8')
        except UnicodeEncodeError:
            try:
                print "Tweet nr. : "+str(i+1)
                print str(tweets[i])
            except UnicodeEncodeError:
                print "Could not print tweet number "+str(i+1)+". Deleting tweet..."
                del tweets[i]
                continue
        
        userinput = raw_input("...")
        while not legal_input(userinput):
            userinput = raw_input("Unlawful input! Please re-introduce.")
        if userinput == '1':
            tweets[i].set_sentiment("negative")
        elif userinput == '2':
            tweets[i].set_sentiment("neutral")
        elif userinput == '3':
            tweets[i].set_sentiment("positive")
        elif userinput == '5':
            print "Deleting tweet..."
            del tweets[i]
            continue
        elif userinput == '0':
            if i > 0:
                i = i-1
            continue
        elif userinput == 'x':
            break
        i = i+1
        
        
    #TODO: should tweets be re-encoded to UTF-8 when read from the dataset?
    #Store the annotated tweets with their sentiment to file
    tweetlines = []
    for t in tweets[:i]:
        if t.get_sentiment() is None:
            continue
        tweetlines.append(t.to_tsv())
    out_dir = username+"_annotated_data"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    #dataset[4:] drops the first four characters of the dataset path (assumed to be a directory prefix)
    utils.store_dataset(tweetlines, out_dir+dataset[4:])
    
    print "Domo arigato!"