-
Notifications
You must be signed in to change notification settings - Fork 0
/
linguatools_lucho.py
95 lines (74 loc) · 3.2 KB
/
linguatools_lucho.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
class PolarityClasifier(object):
    """Polarity tagger for reviews whose sentiment is unknown (stub).

    Planned design, per the original author's note: tag reviews that are not
    known to be positive or negative, using as training data the reviews the
    system already holds tagged as positive or negative.

    Neither method is implemented yet.
    """

    def train(self):
        """Train the classifier; currently it only prints a placeholder."""
        # Placeholder output kept from the original stub.
        print('hola')

    def get_polarity(self, sentence):
        """Return the polarity of *sentence*.

        Not implemented yet: always returns None.
        """
        return None
class CorpusGenerator(object):
    """Dump the tagged text of reviews into ``word/tag`` corpus files."""

    def __init__(self):
        """Create a generator; it holds no state of its own."""
        pass

    def convert_tag(self, tuple_tag):
        """Join the items of *tuple_tag* with ``/`` (e.g. ``good/JJ``)."""
        return "/".join(tuple_tag)

    def _write_tagged(self, handle, label, tagged_words):
        """Echo *tagged_words* to stdout and write them as one corpus entry."""
        print('%s:  %s' % (label, tagged_words))
        handle.write(u' '.join(self.convert_tag(pair) for pair in tagged_words))
        # Each entry is followed by an empty line.
        handle.write(u'\n\n')

    def parse_data(self, list_reviews, pos_path="../data/rev_pos.pos",
                   neg_path="../data/rev_neg.pos"):
        """Write the positive and negative parts of each review to disk.

        For every review, ``review.review_text["text"][1]`` is written to
        *pos_path* and ``review.review_text["text"][3]`` to *neg_path*; each
        is an iterable of tuples that ``convert_tag`` joins with ``/``. Both
        files are opened for writing as UTF-8 and any existing content is
        replaced. Progress is echoed to stdout.

        Args:
            list_reviews: iterable of objects exposing ``id`` and
                ``review_text``.
            pos_path: destination of the positive entries.
            neg_path: destination of the negative entries.
        """
        import codecs

        # The with-block guarantees both handles are closed even when a
        # malformed review raises part-way through the loop.
        with codecs.open(pos_path, 'w', 'utf-8') as fw_pos, \
                codecs.open(neg_path, 'w', 'utf-8') as fw_neg:
            for review in list_reviews:
                print('---REVIEW---')
                print('rev id:  %s' % (review.id,))
                tagged = review.review_text["text"]
                self._write_tagged(fw_pos, 'pos', tagged[1])
                self._write_tagged(fw_neg, 'neg', tagged[3])

    def word(self):
        """Print every tagged sentence of ``review_hotels_pos.pos``.

        NOTE(review): this reads ``self.reader``, which this class never
        assigns, so the call raises AttributeError unless a caller sets
        ``reader`` beforehand (presumably an NLTK tagged-corpus reader;
        confirm the intended owner of this method).
        """
        for sent in self.reader.tagged_sents(['review_hotels_pos.pos']):
            print(sent)
class CorpusAnalysis(object):
    """Frequency statistics over the ``.pos`` corpus files in ``../data/``."""

    # NOTE(review): freq_dist_tags reads ``self.cat_pos`` but the original
    # code never defined it. The positive corpus file used elsewhere in this
    # class is assumed to be the intended value -- confirm.
    cat_pos = 'rev_pos.pos'

    def __init__(self):
        """Set the punctuation filter and load the corpus reader."""
        self.punctuation = ['.', ',', ';', '!', '?', '_', '"', '&', "'"]
        self.load()

    def load(self):
        """Create ``self.reader`` over every ``*.pos`` file in ``../data/``."""
        from nltk.corpus.reader import TaggedCorpusReader

        self.reader = TaggedCorpusReader('../data/', r'.*\.pos')

    def words(self):
        """Print the words of ``rev_pos.pos`` as returned by the reader."""
        print(self.reader.words(['rev_pos.pos']))

    def ngrams(self, words, n=0, stop_words=None):
        """Yield 4-tuples of each kept word preceded by its three predecessors.

        Words found in the stop-word list or in ``self.punctuation`` are
        dropped first (the comparison is case-sensitive). Missing
        predecessors at the start of the sequence are reported as None.

        Args:
            words: iterable of word strings.
            n: accepted for compatibility; currently unused (the window is
                always four items wide).
            stop_words: optional iterable of words to drop; when None the
                NLTK English stop-word list is loaded.

        Yields:
            ``(w_minus3, w_minus2, w_minus1, w)`` tuples.
        """
        if stop_words is None:
            from nltk.corpus import stopwords
            stop_words = stopwords.words('english')
        # Build the exclusion set once instead of reloading the stop-word
        # list for every word.
        excluded = set(stop_words) | set(self.punctuation)

        wprev, wprev1, wprev2 = None, None, None
        for w in words:
            if w in excluded:
                continue
            yield (wprev, wprev1, wprev2, w)
            wprev, wprev1, wprev2 = wprev1, wprev2, w

    def freq_dist_words(self):
        """Print the n-gram frequency distribution of each review corpus.

        Builds a ConditionalFreqDist keyed by corpus file name whose samples
        are the tuples produced by ``ngrams``, then prints the negative and
        positive distributions.
        """
        from nltk import ConditionalFreqDist

        categories = ['rev_neg.pos', 'rev_pos.pos']
        cfd = ConditionalFreqDist(
            (category, gram)
            for category in categories
            for gram in self.ngrams(self.reader.words(category))
        )
        print('neg : %s' % (cfd['rev_neg.pos'],))
        print('pos : %s' % (cfd['rev_pos.pos'],))

    def freq_dist_tags(self):
        """Return a ConditionalFreqDist of words conditioned on their tag.

        Only purely alphabetic words of the ``self.cat_pos`` corpus file are
        counted.
        """
        from nltk import ConditionalFreqDist

        return ConditionalFreqDist(
            (tag, word)
            for (word, tag) in self.reader.tagged_words(self.cat_pos)
            if word.isalpha()
        )

    def MI(self):
        """Placeholder; not implemented.

        NOTE(review): the name suggests mutual information -- unconfirmed.
        """
        pass