/
piece_table_editor.py
225 lines (180 loc) · 8.61 KB
/
piece_table_editor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import Trie
from Piece import Piece
class PieceTableEditor:
    """Text editor whose document is stored as a piece table.

    The document is never rewritten in place.  ``original`` holds the text the
    editor was created with, ``buffer`` is an append-only string receiving all
    inserted text, and ``pieces`` is an ordered list of ``Piece`` descriptors
    (``inBuffer``, ``offset``, ``length``) whose concatenation is the current
    document.  ``pieces`` always contains at least one (possibly empty) piece.
    """

    # On windows, the dictionary can often be found at:
    # C:/Users/{username}/AppData/Roaming/Microsoft/Spelling/en-US/default.dic
    DEFAULT_DICTIONARY_PATH = (
        "C:/Users/kirti/AppData/Roaming/Microsoft/Spelling/en-US/default.dic"
    )

    # Initialization
    def __init__(self, document, dictionary_path=None):
        """Create an editor for ``document`` and load the spelling dictionary.

        Args:
            document: Initial text of the document.
            dictionary_path: File with whitespace-separated dictionary words.
                Defaults to ``DEFAULT_DICTIONARY_PATH``.

        Raises:
            OSError: If the dictionary file cannot be opened.
        """
        self.textLength = len(document)
        self.original = document
        self.buffer = ""
        self.pieces = [Piece(False, 0, len(document))]
        self.dictionary = Trie.Trie()
        if dictionary_path is None:
            dictionary_path = self.DEFAULT_DICTIONARY_PATH
        with open(dictionary_path) as input_dictionary:
            for line in input_dictionary:
                # split() without arguments skips blank lines and repeated
                # separators, so no empty word is ever stored.
                for word in line.split():
                    self.dictionary.insert(word.lower())
        self.pasteText = ""

    # Insert text at a particular index
    def insert(self, text, index):
        """Insert ``text`` so that it starts at document position ``index``.

        Raises:
            IndexError: If ``index`` is negative or past the end of the text.
        """
        if not text:
            return
        # Finding piece that overlaps the given index (validates the index
        # before any state is modified).
        pieceIndex, pieceOffset = self.getPieceIndexAndOffset(index)
        current = self.pieces[pieceIndex]
        # Append text to added buffer
        added_offset = len(self.buffer)
        self.buffer += text
        self.textLength += len(text)
        # If insertion is at the end and the piece points to the end of the
        # add buffer, just increase the length
        if (current.inBuffer
                and pieceOffset == current.offset + current.length == added_offset):
            current.length += len(text)
            return
        # Splitting into three pieces: text before, the new text, text after
        consumed = pieceOffset - current.offset
        split = [
            Piece(current.inBuffer, current.offset, consumed),
            Piece(True, added_offset, len(text)),
            Piece(current.inBuffer, pieceOffset, current.length - consumed),
        ]
        split = [piece for piece in split if piece.length > 0]
        self.pieces = self.replacePieces(pieceIndex, 1, split)

    def getPieceIndexAndOffset(self, index):
        """Locate document position ``index`` inside the piece list.

        Returns:
            ``(piece_index, buffer_offset)`` where ``buffer_offset`` is the
            position inside the buffer the piece refers to.  A position on the
            boundary between two pieces resolves to the end of the earlier one.

        Raises:
            IndexError: If ``index`` is negative or past the end of the text.
        """
        if index < 0:
            raise IndexError("Text index out of range")
        remainingOffset = index
        for i, piece in enumerate(self.pieces):
            if remainingOffset <= piece.length:
                return (i, piece.offset + remainingOffset)
            remainingOffset -= piece.length
        # Falling off the loop used to return None, which surfaced in callers
        # as an unrelated TypeError while unpacking.
        raise IndexError("Text index out of range")

    def replacePieces(self, start, replace_count, items):
        """Return a new piece list with ``replace_count`` pieces, beginning at
        ``start``, replaced by ``items``.  ``self.pieces`` is not modified."""
        return self.pieces[:start] + items + self.pieces[start + replace_count:]

    def delete(self, index, length):
        """Remove ``length`` characters starting at ``index``.

        A negative ``length`` removes the ``-length`` characters that end at
        ``index``.

        Raises:
            IndexError: If the range to delete is not inside the text.
        """
        if length == 0:
            return
        if length < 0:
            self.delete(index + length, -length)
            return
        if index < 0:
            raise IndexError("Text index out of range")
        # Get overlapping pieces (both lookups validate the range before any
        # state is modified).
        startIndex, startOffset = self.getPieceIndexAndOffset(index)
        stopIndex, stopOffset = self.getPieceIndexAndOffset(index + length)
        self.textLength -= length
        # If single piece, check if delete is at the start or end the piece
        if startIndex == stopIndex:
            piece = self.pieces[startIndex]
            if startOffset == piece.offset:
                piece.offset += length
                piece.length -= length
                return
            elif stopOffset == piece.offset + piece.length:
                piece.length -= length
                return
        startPiece = self.pieces[startIndex]
        endPiece = self.pieces[stopIndex]
        # Splitting into two pieces: what is kept before the deleted range and
        # what is kept after it.  (The kept prefix must appear exactly once,
        # otherwise the text before the deletion shows up twice.)
        deletePieces = [
            Piece(startPiece.inBuffer, startPiece.offset,
                  startOffset - startPiece.offset),
            Piece(endPiece.inBuffer, stopOffset,
                  endPiece.length - (stopOffset - endPiece.offset)),
        ]
        deletePieces = [piece for piece in deletePieces if piece.length > 0]
        delete_count = stopIndex - startIndex + 1
        self.pieces = self.replacePieces(startIndex, delete_count, deletePieces)
        if not self.pieces:
            # Keep an empty anchor piece so later lookups at index 0 still
            # find a piece to work with.
            self.pieces = [Piece(False, 0, 0)]

    def get_text(self):
        """Return the whole document as a single string."""
        return "".join(
            (self.buffer if piece.inBuffer else self.original)[
                piece.offset:piece.offset + piece.length
            ]
            for piece in self.pieces
        )

    # Getting string of length at an index
    def getString(self, index, length):
        """Return the ``length`` characters starting at ``index``.

        A negative ``length`` returns the ``-length`` characters that end at
        ``index``.

        Raises:
            IndexError: If the requested range is not inside the text.
        """
        if length < 0:
            return self.getString(index + length, -length)
        # Get overlapping pieces
        startIndex, startOffset = self.getPieceIndexAndOffset(index)
        stopIndex, stopOffset = self.getPieceIndexAndOffset(index + length)
        startPiece = self.pieces[startIndex]
        buffer = self.buffer if startPiece.inBuffer else self.original
        # If single piece, return text from piece
        if startIndex == stopIndex:
            return buffer[startOffset:startOffset + length]
        parts = [buffer[startOffset:startPiece.offset + startPiece.length]]
        for i in range(startIndex + 1, stopIndex + 1):
            cur_piece = self.pieces[i]
            buffer = self.buffer if cur_piece.inBuffer else self.original
            # If the ending piece, only add remaining length to the string
            if i == stopIndex:
                parts.append(buffer[cur_piece.offset:stopOffset])
            else:
                parts.append(
                    buffer[cur_piece.offset:cur_piece.offset + cur_piece.length]
                )
        return "".join(parts)

    def cut(self, i, j):
        """Move the text in ``[i, j)`` from the document to the clipboard."""
        self.pasteText = self.getString(i, j - i)
        self.delete(i, j - i)

    def copy(self, i, j):
        """Copy the text in ``[i, j)`` to the clipboard."""
        self.pasteText = self.getString(i, j - i)

    def paste(self, i):
        """Insert the clipboard text at position ``i``."""
        self.insert(self.pasteText, i)

    def misspellings(self):
        """Return how many distinct words of the document are not in the
        dictionary.

        Words are compared in lowercase because the dictionary is stored in
        lowercase; runs of whitespace never produce an empty "word".
        """
        words = {word.lower() for word in self.get_text().split()}
        return sum(1 for word in words if not self.dictionary.search(word))
import timeit
class EditorBenchmarker:
    """Times a few PieceTableEditor workloads with ``timeit``.

    The class attributes below are source-code templates handed to ``timeit``:
    ``new_editor_case`` is the setup code (it creates the editor ``s``), the
    others are the timed statements.  ``__init__`` fills in the repetition
    count, ``benchmark`` fills in the document.
    """

    # The document is embedded with ``!r`` so that quotes, backslashes and
    # newlines inside it still produce a valid string literal.
    new_editor_case = """
from __main__ import PieceTableEditor
s = PieceTableEditor({!r})"""

    editor_cut_paste = """
for n in range({}):
    if n%2 == 0:
        s.cut(3, 5)
    else:
        s.paste(2)"""

    editor_copy_paste = """
for n in range({}):
    if n%2 == 0:
        s.copy(1, 3)
    else:
        s.paste(2)"""

    editor_get_text = """
for n in range({}):
    s.get_text()"""

    editor_mispellings = """
for n in range({}):
    s.misspellings()"""

    def __init__(self, cases, N):
        """Store the documents to benchmark and bake ``N`` into the workloads.

        Args:
            cases: Iterable of document strings; each one is benchmarked.
            N: Number of loop iterations each timed statement performs.
        """
        self.cases = cases
        self.N = N
        # The formatted snippets shadow the class-level templates on this
        # instance only.
        self.editor_cut_paste = self.editor_cut_paste.format(N)
        self.editor_copy_paste = self.editor_copy_paste.format(N)
        self.editor_get_text = self.editor_get_text.format(N)
        self.editor_mispellings = self.editor_mispellings.format(N)

    def benchmark(self):
        """Run every workload once per case and print the elapsed times.

        Each workload gets a freshly constructed editor because the setup code
        is re-executed by every ``timeit`` call.
        """
        workloads = (
            ("cut paste", self.editor_cut_paste),
            ("copy paste", self.editor_copy_paste),
            ("text retrieval", self.editor_get_text),
            ("mispelling", self.editor_mispellings),
        )
        for i, case in enumerate(self.cases):
            print("Evaluating case: {}".format(i + 1))
            new_editor = self.new_editor_case.format(case)
            for label, statement in workloads:
                elapsed = timeit.timeit(stmt=statement, setup=new_editor, number=1)
                print("{} {} operations took {} s".format(self.N, label, elapsed))
if __name__ == "__main__":
    # Benchmark inputs: a tiny document, the same phrase repeated one million
    # times, and a paragraph of ordinary prose.
    short_doc = "hello friends"
    huge_doc = short_doc * 1000000
    prose_doc = "This was ainteresting question and after looking at the assignment, first problem I could see with the editor was it was performing cut/paste operations on a string. As strings are immutable, each call to those functions were costing us generating a new string alltogther with complexity O(n). Another problem I saw with the editor_mispellings function. As in the dictionary there would be a lot of words and varying length, to check for a correct spelling would take comparisons of length of key size of dictionary. I started with solving that problem first with our existing data strctures. Simply using list instead of string could have been better if there are very insert and delete operations otherwise it would also perform with similar complexity"
    # Every workload runs 100 iterations per document.
    documents = [short_doc, huge_doc, prose_doc]
    EditorBenchmarker(documents, 100).benchmark()