-
Notifications
You must be signed in to change notification settings - Fork 4
/
tagger.py
72 lines (48 loc) · 1.55 KB
/
tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from nltk.corpus import indian
from nltk.corpus import brown
from nltk import UnigramTagger
import sys
reload(sys)
from nltk.tag import tnt
from nltk.tokenize import sent_tokenize, word_tokenize
# Python 2 only: make UTF-8 the implicit str/unicode codec. This depends on
# the `reload(sys)` call above, which restores `sys.setdefaultencoding`.
sys.setdefaultencoding("utf-8")

# English tags: tagged sentences from the Brown corpus.
train_sents = brown.tagged_sents()

# Hindi tags: tagged sentences from 'hindi.pos' in the Indian corpus, minus
# the last sentence. This slice is what the TnT tagger is trained on.
train_data_hindi = indian.tagged_sents('hindi.pos')[:-1]

# Create the Hindi TnT tagger and train it exactly once, and only after its
# training data has been assigned (training any earlier is a NameError).
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data_hindi)
def english_tag(eng_tweet):
    """Return the adjectives and adverbs found in an English tweet.

    The tweet is tokenized, every token is POS-tagged, and the tokens whose
    tag is one of the Penn Treebank adjective tags (JJ, JJR, JJS) or adverb
    tags (RB, RBR, RBS) are kept.

    Args:
        eng_tweet: The raw tweet text.

    Returns:
        A list of the matching tokens, in the order they occur in the tweet.
    """
    # Local import: the module only has `from nltk ... import ...` lines, so
    # the bare name `nltk` is never bound at file level.
    from nltk import pos_tag

    wanted_tags = ('JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS')
    tokens = word_tokenize(eng_tweet)
    return [word for word, tag in pos_tag(tokens) if tag in wanted_tags]
def hindi_tag(hindi_tweet):
    """Split a Hindi tweet into tokens.

    Only tokenization happens here; no POS tagging is applied to the result.

    Args:
        hindi_tweet: The raw tweet text.

    Returns:
        The list of tokens produced by NLTK's ``word_tokenize``.
    """
    # Call the `word_tokenize` imported at the top of the file; the bare name
    # `nltk` is never bound in this module.
    return word_tokenize(hindi_tweet)
def hinglish_tag(hinglish_tweet):
    """Placeholder handler for Hinglish tweets; not implemented yet.

    Args:
        hinglish_tweet: The tweet text. Currently ignored.

    Returns:
        Always ``None``.
    """
    return None
def tagger(tweet):
    """Dispatch a tweet to the tagging function for its language.

    Args:
        tweet: An indexable pair whose item 0 is the tweet text and whose
            item 1 is the language code.

    Returns:
        The result of ``english_tag`` for ``'en'``, of ``hindi_tag`` for
        ``'hi'``, and of ``hinglish_tag`` for any other language code.
    """
    text, language = tweet[0], tweet[1]
    if language == 'en':
        return english_tag(text)
    if language == 'hi':
        return hindi_tag(text)
    # Every other language code falls through to the Hinglish handler.
    return hinglish_tag(text)
# NOTE(review): this trailing script reads `eng_tweet`, which is not assigned
# anywhere at module level in this file. Presumably it should be a list of
# tokens, as inside `english_tag` -- confirm and bind it before this point.
import csv

from nltk import pos_tag

# Keep only the tokens tagged as Penn Treebank adjectives or adverbs.
word_features = [
    word
    for word, tag in pos_tag(eng_tweet)
    if tag in ('JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS')
]

# Read the lexicon once instead of re-opening it for every word. Rows are
# grouped by their first column so each word still sees all of its rows, in
# file order. The file is only opened when there is something to look up.
polarity_by_word = {}
if word_features:
    with open('words.txt', 'rt') as f:
        for row in csv.reader(f, delimiter=','):
            # Blank or one-column rows carry no polarity; skip them rather
            # than failing with IndexError.
            if len(row) < 2:
                continue
            polarity_by_word.setdefault(row[0], []).append(row[1])

# Net score: +1 for every 'pos' match and -1 for every 'neg' match.
rating = 0
for word in word_features:
    for polarity in polarity_by_word.get(word, ()):
        # Single-argument form prints the same text on Python 2 and 3.
        print("%s %s" % (word, polarity))
        if polarity == 'pos':
            rating += 1
        elif polarity == 'neg':
            rating -= 1
hindi_pos =