-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_proccessing.py
122 lines (112 loc) · 4.44 KB
/
text_proccessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from frequency_vector import freqdict, normalize
# N-gram orders collected at the character and word level, respectively.
character_ngrams = [2, 3, 4]
word_ngrams = [2, 3]
test = None
# Fail fast at import time if the required NLTK corpora are not installed.
try:
    test = wn.synsets("test")
except Exception:  # was a bare `except:` — don't swallow KeyboardInterrupt/SystemExit
    print("Problem with WordNet, make sure it is downloaded. (run 'nltk.download()' )")
    raise SystemExit(1)  # `exit()` is a site.py convenience helper; SystemExit is always available
try:
    swn.senti_synsets("test")
except Exception:
    print("Problem with SentiWordNet, make sure it is downloaded. (run 'nltk.download()' )")
    raise SystemExit(1)
def proccess_book(bookdata):
    """Process every chapter of *bookdata* in order (delegates to proccess_chapter)."""
    for chap in bookdata.chapters:
        proccess_chapter(chap)
def proccess_chapter(chapter):
    """Build the chapter's statistics register, fill it from every paragraph,
    normalise each frequency table, and attach it as chapter.chapter_register.
    """
    register = {
        "validwords": 0,
        "word_freq_dict": freqdict(),
        "region_freq_dict": freqdict(),
        "topic_freq_dict": freqdict(),
        "usage_freq_dict": freqdict(),
        "sense_dist_dict": freqdict(),
        # One frequency table per configured n-gram order.
        "character_ngrams": {order: freqdict() for order in character_ngrams},
        "word_ngrams": {order: freqdict() for order in word_ngrams},
    }
    for paragraph in chapter.paragraphs:
        proccess_paragraph(paragraph, register)
    # Turn raw counts into relative frequencies.
    for key in ("word_freq_dict", "region_freq_dict",
                "topic_freq_dict", "usage_freq_dict"):
        normalize(register[key])
    # The sentiment table keeps its running total under the "total" key.
    normalize(register["sense_dist_dict"], cachetotal="total")
    for table in register["character_ngrams"].values():
        normalize(table)
    for table in register["word_ngrams"].values():
        normalize(table)
    chapter.chapter_register = register
def proccess_paragraph(paragraphdata, chapter_register):
    """Feed every sentence of the paragraph into the chapter register."""
    for sentence in paragraphdata.sentences:
        proccess_sentence(sentence, chapter_register)
def proccess_sentence(sentencedata, chapter_register):
    """Accumulate word, n-gram, domain and sentiment statistics for one sentence.

    For every word: counts it in the word frequency table, then (if WordNet
    knows it) distributes weight over its usage/topic/region domains, adds its
    averaged SentiWordNet scores, and counts its character n-grams.
    Word-level n-grams for the whole sentence are counted first.
    """
    wordngrams = chapter_register["word_ngrams"]
    for n in wordngrams:
        # NOTE(review): `>` skips sentences of length exactly n (which would
        # yield one n-gram) — possible off-by-one, kept as-is; TODO confirm.
        if len(sentencedata.words) > n:
            ngrams(wordngrams[n], n, sentencedata.words)
    for x in sentencedata.words:
        words = chapter_register["word_freq_dict"]
        words.plusplus(x, 1)
        try:
            sets = wn.synsets(x)
            if len(sets) <= 0:
                # BUGFIX: was `break`, which aborted the whole word loop on the
                # first out-of-vocabulary word, dropping stats for every
                # remaining word of the sentence.
                continue
            chapter_register["validwords"] = chapter_register["validwords"] + 1
            # Spread one unit of weight evenly across all domain labels found
            # over every synset of the word.
            usage = chapter_register["usage_freq_dict"]
            usages = [inner for outer in sets for inner in outer.usage_domains()]
            for use in usages:
                usage.plusplus(use._name, float(1) / len(usages))
            topic = chapter_register["topic_freq_dict"]
            topics = [inner for outer in sets for inner in outer.topic_domains()]
            for top in topics:
                topic.plusplus(top._name, float(1) / len(topics))
            region = chapter_register["region_freq_dict"]
            regions = [inner for outer in sets for inner in outer.region_domains()]
            for reg in regions:
                region.plusplus(reg._name, float(1) / len(regions))
            # Average the SentiWordNet pos/neg/obj scores over all synsets.
            sense = chapter_register["sense_dist_dict"]
            sentis = [swn.senti_synset(synset._name) for synset in sets]
            if None in sentis:
                continue
            for sen in sentis:
                sense.plusplus("pos", float(sen.pos_score()) / len(sentis))
                sense.plusplus("neg", float(sen.neg_score()) / len(sentis))
                sense.plusplus("obj", float(sen.obj_score()) / len(sentis))
            sense.plusplus("total", 1)
            # Character n-grams use ^/* as start/end-of-word markers.
            charngrams = chapter_register["character_ngrams"]
            for n in character_ngrams:
                if len(x) > n:
                    ngrams(charngrams[n], n, "^" + x + "*")
        except Exception as e:  # narrowed from BaseException: let Ctrl-C propagate
            print(e)
            print("problem finding word:" + x)
def ngrams(ngramfreq, n, seq, comb=tuple):
    """Count every length-*n* window of *seq* into *ngramfreq* and return it.

    Each window is converted with *comb* (a tuple by default) before being
    counted.  Note: plusplus is called with the key only — presumably the
    increment defaults to 1 in freqdict; verify against its definition.
    """
    last_start = len(seq) - n
    for start in range(last_start + 1):
        window = seq[start:start + n]
        ngramfreq.plusplus(comb(window))
    return ngramfreq
class dictreducer(dict):
    """A dict that counts occurrences via append() and merges with `+`.

    Keys present in both operands of `+` are combined with the binary
    reduction function `red` (addition by default); all other keys are
    copied through unchanged.
    """

    def __init__(self, red=lambda x, y: x + y):
        # red: binary function used to combine values for shared keys in __add__.
        super().__init__()  # BUGFIX-adjacent: initialise the dict base explicitly
        self.red = red

    def append(self, x):
        # Record one occurrence of key x (always +1, independent of red).
        self[x] = self.get(x, 0) + 1

    def __add__(self, other):
        # BUGFIX: the result used to be built with the *default* reducer, so
        # chained merges ((a + b) + c) silently reverted to addition.  Build
        # it with self.red so the reducer propagates.
        res = dictreducer(self.red)
        res.update(self)
        for key, value in other.items():
            if key in res:
                res[key] = self.red(res[key], value)
            else:
                res[key] = value
        return res