-
Notifications
You must be signed in to change notification settings - Fork 0
/
corrector.py
150 lines (131 loc) · 5.55 KB
/
corrector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 19 16:17:56 2015
@author: ardavan
"""
from spacyapi import tokenizer
import collections,csv,operator
from misc import finder,alphabet
from helper import Helper
# Module-level Helper instance; Corrector.__init__ reads the 'dirtypes' and
# 'food' command lists from it via hlp.dirtypes() and hlp.foodtypes().
hlp = Helper()
class Corrector(object):
    """Suggest corrections for mistyped commands within a named scope.

    ``self.commands`` maps a scope name to the list of valid commands for
    that scope.  ``match`` compares a typed word with the commands of the
    active scope and returns the plausible candidates, best first.
    ``disting`` tokenizes free text and buckets the tokens into words,
    times, numbers and "splits".
    """

    # A candidate is rejected once its letter-count distance plus length
    # penalty reaches this value (before the remaining penalties are added).
    _COUNT_CUTOFF = 3.33
    # A candidate is rejected once its final score reaches this value.
    _SCORE_CUTOFF = 6

    def __init__(self, scope='main', history=None):
        """Create a corrector for ``scope``.

        scope   -- key into ``self.commands`` selecting the active commands.
        history -- optional dict of previously accepted corrections
                   (typed word -> corrected word).  When given, the same
                   object is stored and later mutated by ``addHist``.  When
                   omitted, every instance gets its own fresh dict instead
                   of sharing one default dict across all instances.
        """
        self.scope = scope  ## scope must be the same as filename (without .txt)
        # NOTE(review): 'san fransisco' looks like a misspelling of
        # 'san francisco'; left unchanged because other code may rely on
        # this exact string -- confirm.
        self.commands = {"time": ["schedule", "remind", "between", "difference"],
                         "langs": ["eng", "spa"],
                         "food": [],
                         "main": ["exit", "time", "directory", "next", "previous",
                                  'help', 'list', "places", "outline", "find"],
                         "locations": ['london', 'paris', 'sydney', 'san fransisco'],
                         "dirtypes": [],
                         "outline": ["exit", "next", "expand", "tables", "goto", "level",
                                     "help", "all", "find"]}
        self.commands['dirtypes'] = hlp.dirtypes()
        self.commands['food'] = hlp.foodtypes()
        self.lang = {"eng": "english", "spa": "spanish"}
        self.langsel = "eng"
        self.history = []
        self.correctedHistory = {} if history is None else history
        self.current = {}

    def cur(self):
        """Return the result dict produced by the latest ``disting`` call."""
        return self.current

    def datacollect(self, filename, setdir):
        """Append the second CSV column of ``filename + '.txt'`` to a list.

        filename -- path without the ``.txt`` suffix; also used as the key
                    into ``setdir``.
        setdir   -- dict mapping names to lists; ``setdir[filename]`` is
                    created when missing and extended in place.

        Rows with fewer than two columns (for example blank lines, which
        ``csv.reader`` yields as empty lists) are skipped instead of raising
        IndexError.
        """
        with open(filename + ".txt", 'r') as csvfile:
            bucket = setdir.setdefault(filename, [])
            for row in csv.reader(csvfile, delimiter=","):
                if len(row) < 2:
                    continue
                bucket.append(row[1])

    def disting(self, text):
        """Tokenize ``text`` and sort its tokens into categories.

        The result is a dict with the tokenizer's 'tokens', 'entities' and
        'types' plus the lists 'words', 'times', 'numbers' and 'splits'.
        It is appended to ``self.history`` and stored as ``self.current``;
        nothing is returned.

        ``finder(pattern, token)`` is a project helper; it is presumably a
        regular-expression search whose ``found()`` reports a match and
        whose ``result()`` returns what matched -- confirm against misc.
        """
        parsed = tokenizer(text)
        tokens = parsed['tokens']
        entities = parsed['entities']
        types = parsed['types']
        result = {'tokens': tokens, 'entities': entities, 'types': types,
                  'words': [], 'times': [], 'numbers': [], 'splits': []}
        for token in tokens:
            # NOTE(review): if this is a regex character class, '|' is a
            # literal pipe there, so tokens containing '|' are also treated
            # as times -- confirm that is intended.
            has_separator = finder(r'[-|/|:|@|\\]', token).found()
            if has_separator:
                result['times'].append(token)
            wordfinder = finder(r'\w+', token)
            numberfinder = finder(r'\d+', token)
            has_word = wordfinder.found()
            has_number = numberfinder.found()
            has_symbol = finder(r',|\.', token).found()
            # Ordinal suffixes; 'nd' is included so "2nd"/"22nd" are
            # recognised like "1st", "3rd" and "4th".
            is_ordinal = finder(r'\d(th|st|nd|rd)', token.lower()).found()

            if has_word and has_number:
                if is_ordinal:
                    result['times'].append(token)
                elif len(wordfinder.result()) != len(numberfinder.result()):
                    result['splits'].append(token)
            if has_number and has_symbol:
                result['numbers'].append(token)
            if has_number and not has_separator and not is_ordinal:
                if token not in result['splits'] and not has_symbol:
                    result['numbers'].append(token)
            if has_word and not has_number and not has_symbol:
                result['words'].append(token)
        self.history.append(result)
        self.current = result

    def hashtable(self):
        """Return ``[lowercased command, Counter of its letters]`` pairs.

        One pair is produced per command of the active scope, in the order
        the commands are listed.
        """
        return [[command.lower(), collections.Counter(command.lower())]
                for command in self.commands[self.scope]]

    def match(self, word):
        """Return correction candidates for ``word`` as (name, score) tuples.

        * ``[("None", 10)]`` when ``word`` is already a command of the
          active scope, is a single character, or nothing scores well
          enough.
        * ``[(correction, 0)]`` when ``word`` has a remembered correction.
        * Otherwise the surviving candidates sorted by ascending score
          (lower is better).

        The comparison is case-insensitive throughout.  Empty command names
        are ignored (they cannot be scored) and a command listed more than
        once is scored once.
        """
        if word in self.commands[self.scope] or len(word) == 1:
            return [("None", 10)]
        remembered = self.matchHist(word)
        if remembered is not None:
            return [(remembered, 0)]

        lowered = word.lower()
        typed_counts = collections.Counter(lowered)
        scores = {}
        seen = set()
        for name, name_counts in self.hashtable():
            if not name or name in seen:
                continue
            seen.add(name)
            score = self._score(lowered, typed_counts, name, name_counts)
            if score is not None:
                scores[name] = score

        if not scores:
            return [("None", 10)]
        return sorted(scores.items(), key=operator.itemgetter(1))

    def _score(self, lowered, typed_counts, name, name_counts):
        """Score one candidate; return None when it should be discarded."""
        # Per-letter count difference for every letter that was typed; a
        # letter absent from the candidate contributes its full count.
        score = sum(abs(count - name_counts.get(letter, 0))
                    for letter, count in typed_counts.items())
        # Length penalty, centred on the typed word being 1.2x shorter
        # than the candidate.
        score += abs(1.0 - (len(lowered) * 1.2) / len(name))
        if score >= self._COUNT_CUTOFF:
            return None
        # Letters the candidate has but the typed word lacks entirely.
        score += sum(count for letter, count in name_counts.items()
                     if letter not in typed_counts)
        # Position-by-position mismatches over the common prefix length;
        # candidate positions beyond the typed word are not penalised.
        score += sum(1 for expected, typed in zip(name, lowered)
                     if expected != typed)
        if score >= self._SCORE_CUTOFF:
            return None
        return score

    def addHist(self, word, final):
        """Remember that ``word`` was corrected to ``final``."""
        self.correctedHistory[word] = final

    def matchHist(self, word):
        """Return the remembered correction for ``word``, or None."""
        return self.correctedHistory.get(word)