-
Notifications
You must be signed in to change notification settings - Fork 1
/
annotation.py
88 lines (78 loc) · 2.75 KB
/
annotation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
'''
Created on 30. sep. 2014
@author: JohnArne
'''
import utils
import tweet
import os
def user_annotation():
"""
Feed tweets to console one at a time, and ask user for sentiment annotation.
"""
dataset = utils.select_dataset()
text_tweets = utils.get_dataset(dataset)
tweets = []
for text_tweet in text_tweets:
tweets.append(tweet.to_tweet(text_tweet))
username = raw_input("Name? ... ")
print "\n--------------\n"
print "Input: "
print "\n1: Negative sentiment (Negative opinion). \n2: Neutral/objective sentiment (No opinion). \n3: Positive sentiment (Positive opinion). \n5: Delete the tweet from the dataset. \nx: Cancel sequence. 0: Go back to previous tweet. "
print "\n--------------\n"
annotated_to = 0
i = 0
while i < len(tweets):
# tweets[i].text.encode('utf8')
# text = tweets[i].text
# tweets[i].text = text.decode('utf8')
try:
print "Tweet nr. : "+str(i+1)
print str(((i+1.0*1.0)/len(tweets)*1.0)*100)+" % done "
print unicode(tweets[i].__str__().decode('utf8'))
except UnicodeEncodeError:
try:
print "Tweet nr. : "+str(i+1)
print str(tweets[i])
except UnicodeEncodeError:
print "Could not print tweet number "+str(i+1) +". Deleting tweet..."
tweets.remove(tweets[i])
continue
userinput = raw_input("...")
while not legal_input(userinput):
userinput = raw_input("Unlawful input! Please re-introduce.")
if userinput is '1':
tweets[i].set_sentiment("negative")
elif userinput is '2':
tweets[i].set_sentiment("neutral")
elif userinput is '3':
tweets[i].set_sentiment("positive")
elif userinput is '5':
print "Deleting tweet..."
tweets.remove(tweets[i])
continue
elif userinput is '0':
i = i-1
continue
elif userinput is 'x':
break
i = i+1
#TODO: need to encode to utf when getting from dataset?!?!
#Store the sentiment in file!
tweetlines = []
for t in tweets[:i]:
if t.get_sentiment() is None:
continue
tweetlines.append(t.to_tsv())
dir = username+"_annotated_data"
if not os.path.exists(dir):
os.makedirs(dir)
utils.store_dataset(tweetlines, dir+dataset[4:])
print "Domo arigato!"
def legal_input(userinput):
"""
Checks input and returns true if the input is legal. Legal input should be "1", "2", "3", "5", or "x"
"""
legal_inputs = ['1','2','3','5','0','x']
if userinput in legal_inputs:
return True
return False