/
formatter.py
575 lines (535 loc) · 24.1 KB
/
formatter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
# -*- coding: utf-8 -*-
################
# BUGS:
# initStyles does not properly initialize the font ("Arial" stays as "Calibri")
# Other stuff
# Other stuff
################
import sys
# reload(sys)
# sys.setdefaultencoding('utf8')
# sys.path.insert(0,"/Users/Gordon/Gordon's Files/AutoFormatter/lib")
import filelib, listlib, regexlib, doclib, numlib
import os, string
from docx import Document
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.text import WD_ALIGN_PARAGRAPH
import unicodedata
from docx.shared import Inches
from docx.shared import Pt
from story import Story
import Tkinter as tk
import time
import unzip
# The Formatter is an object that is able to take in a Story object and return a properly
# formatted version.
class Formatter(object):
# Initializes the values used by the Formatter.
def __init__(self):
self.legalChars = "1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM!#$%&*()_+=;:'\"/?.,—<>“”’ "
self.story = None
self.document1 = None
self.document2 = None
self.chapterNames = True
self.progress = "Waiting for input."
self.stage = "Incomplete"
self.step = 0.0
self.steps = 0.0
self.gettingSize = False
self.paragraphDict = {}
self.numRuns = 0.0
def initStyles(self):
    """Apply the house fonts, sizes and indents to document2's built-in styles.

    Heading 1-3 become Arial (16/14/13 pt, bold; Heading 2 also italic) and
    Normal becomes 14 pt Times New Roman. Every style gets a 0.5 inch
    first-line indent and has its explicit font colour cleared.

    Setting ``font.name`` alone is not enough for styles whose template
    definition selects a *theme* font: the theme attributes on ``w:rFonts``
    take precedence over the explicit face, so the heading would keep
    rendering in the theme font. Those attributes are therefore removed
    after the name is set.
    """
    # Local import: only this method needs the namespace helper.
    from docx.oxml.ns import qn

    self.progress = "Initializing styles..."
    # (style name, font face, point size, bold, italic); None means
    # "leave whatever the template defines".
    specs = (
        ("Heading 1", "Arial", 16, True, None),
        ("Heading 2", "Arial", 14, True, True),
        ("Heading 3", "Arial", 13, True, None),
        ("Normal", "Times New Roman", 14, None, None),
    )
    themeAttrs = (qn("w:asciiTheme"), qn("w:hAnsiTheme"))
    styles = self.document2.styles
    for styleName, face, points, bold, italic in specs:
        style = styles[styleName]
        font = style.font
        font.name = face
        font.size = Pt(points)
        font.color.rgb = None
        if bold is not None:
            font.bold = bold
        if italic is not None:
            font.italic = italic
        style.paragraph_format.first_line_indent = Inches(0.5)
        # Drop the theme-font references so the explicit face wins.
        rPr = style.element.rPr
        rFonts = rPr.rFonts if rPr is not None else None
        if rFonts is not None:
            for attr in themeAttrs:
                if attr in rFonts.attrib:
                    del rFonts.attrib[attr]
# Takes in an object representing the story to be formatted and sets the local
# story value to equal it.
def take(self,story):
    """Adopt `story` as the job to format and prepare both documents.

    document1 is loaded from ``story.path``; document2 starts as a fresh
    Document whose styles are then set up by initStyles().
    """
    self.progress = "Taking an unformatted story..."
    self.story = story
    sourcePath = story.path
    self.document1 = Document(sourcePath)
    self.document2 = Document()
    self.initStyles()
# Saves the final document to a docx with the '_formatted' suffix in the same directory as the
# original.
def save(self):
    """Save document2 under the story's path with a '_formatted.docx' suffix."""
    self.progress = "Saving formatted document..."
    # clipRight(path, ".") is applied before the suffix is appended --
    # presumably it trims the original extension; confirm in regexlib.
    target = regexlib.clipRight(self.story.path, ".") + "_formatted.docx"
    self.document2.save(target)
# Creates the document1 (original) and document2 (formatted) Document objects
# and populates the latter using the plaintext version of the former. Also sets
# the base formatting and style of document2.
def build(self):
    """Copy document1's paragraphs and runs into document2, keeping italics.

    Also zeroes the Normal style's paragraph spacing, records the paragraph
    count in self.steps, counts copied runs in self.numRuns, and finishes by
    calling compressRuns().
    """
    self.progress = "Building document..."
    source = self.document1
    target = self.document2
    spacing = target.styles['Normal'].paragraph_format
    spacing.space_before = 0
    spacing.space_after = 0
    # One step per source paragraph.
    self.steps = float(len(source.paragraphs))
    for srcParagraph in source.paragraphs:
        newParagraph = target.add_paragraph("")
        for srcRun in srcParagraph.runs:
            newRun = newParagraph.add_run(srcRun.text)
            inherited = srcRun.font.italic
            # A character style whose name mentions "Italic" forces italics
            # even when the run itself carries no italic flag.
            newRun.font.italic = True if "Italic" in srcRun.style.name else inherited
            self.numRuns += 1
        self.step += 1.0
    self.compressRuns()
# Removes the following symbols from document2: "»","|","«","•"," ","_","■"
# Also changes em-dashes (—) into triple hyphens (---) to avoid formatting
# bugs later on.
def removeSymbols(self):
# Cleans the text of every run in document2: em-dashes are rewritten as
# "---" and a fixed set of stray symbols is removed via regexlib.removeSub.
# self.step is reset to 0 before the pass and incremented as runs are processed.
if not self.gettingSize: self.progress = "Removing symbols..."
document2 = self.document2
self.step = 0.0
for paragraph in document2.paragraphs:
for run in paragraph.runs:
text = run.text
# Em-dashes become three hyphens here; fixEmDash() later turns "---" back
# into an em-dash.
text = text.replace(u"—","---")
text = regexlib.removeSub(text,"»")
text = regexlib.removeSub(text,"|")
text = regexlib.removeSub(text,"«")
text = regexlib.removeSub(text,"•")
# NOTE(review): the literal below shows as one plain space in this copy of
# the file. Removing every space would join all words together, so it was
# presumably a different whitespace character -- confirm against the
# original file before relying on this line.
text = regexlib.removeSub(text," ")
text = regexlib.removeSub(text,"_")
text = regexlib.removeSub(text,"■")
self.step += 1
run.text = text
# Inserts a properly formatted (Heading 1) title at the top of the document.
def insertTitle(self):
    """Insert the story title as a 'Heading 1' paragraph at the top of document2.

    If document2 has no paragraphs yet (an empty source document), the title
    is appended instead of raising IndexError.
    """
    if not self.gettingSize:
        self.progress = "Adding title..."
    title = self.story.title
    style = 'Heading 1'
    paragraphs = self.document2.paragraphs
    if paragraphs:
        paragraphs[0].insert_paragraph_before(title, style)
    else:
        # Nothing to insert before: the title becomes the only paragraph.
        self.document2.add_paragraph(title, style)
# Inserts a properly formatted (Heading 2) author attribution beneath the document
# title.
def insertAuthor(self):
    """Insert 'by <author>' as a 'Heading 2' paragraph in second position.

    The byline goes directly before the current second paragraph, i.e. right
    under the first one. When document2 has fewer than two paragraphs the
    byline is appended instead of raising IndexError.
    """
    if not self.gettingSize:
        self.progress = "Adding author..."
    byline = "by " + self.story.author
    style = 'Heading 2'
    paragraphs = self.document2.paragraphs
    if len(paragraphs) > 1:
        paragraphs[1].insert_paragraph_before(byline, style)
    else:
        self.document2.add_paragraph(byline, style)
# Inserts the copyright date beneath the author, if applicable.
def insertCopyright(self):
    """Insert an italic '(copyright <value>)' line as the third paragraph.

    Does nothing when the story has no copyright value (empty string or
    None). When document2 has fewer than three paragraphs the line is
    appended instead of raising IndexError.
    """
    if not self.gettingSize:
        self.progress = "Adding copyright..."
    copyright = self.story.copyright
    if not copyright:
        return
    paragraphs = self.document2.paragraphs
    if len(paragraphs) > 2:
        line = paragraphs[2].insert_paragraph_before("")
    else:
        line = self.document2.add_paragraph("")
    line.add_run("(copyright %s)" % copyright).italic = True
# Inserts the publisher, if applicable.
def insertPublisher(self):
    """Insert an italic '(publisher <value>)' line as the third paragraph.

    Does nothing when the story has no publisher value (empty string or
    None). When document2 has fewer than three paragraphs the line is
    appended instead of raising IndexError.
    """
    if not self.gettingSize:
        self.progress = "Adding publisher..."
    publisher = self.story.publisher
    if not publisher:
        return
    paragraphs = self.document2.paragraphs
    if len(paragraphs) > 2:
        line = paragraphs[2].insert_paragraph_before("")
    else:
        line = self.document2.add_paragraph("")
    line.add_run("(publisher %s)" % publisher).italic = True
# Finds each chapter header and formats it appropriately (i.e., "Chapter __" becomes
# Heading 2, and the chapter names, if present, become Heading 3)
def formatChapters(self):
    """Style chapter headings as 'Heading 2' and their titles as 'Heading 3'.

    A paragraph counts as a heading when it mentions CHAPTER/Chapter/BOOK/Book
    and is no longer than "Chapter XXXXIIII". When self.chapterNames is set,
    the next paragraph that mentions none of those words is styled as the
    chapter title. One step is counted per paragraph.
    """
    if not self.gettingSize:
        self.progress = "Formatting chapters..."
    headingWords = ("CHAPTER", "Chapter", "BOOK", "Book")
    longestHeading = len("Chapter XXXXIIII")
    awaitingName = False
    for paragraph in self.document2.paragraphs:
        content = paragraph.text
        mentionsHeading = any(word in content for word in headingWords)
        if not mentionsHeading:
            if awaitingName and self.chapterNames:
                paragraph.style = 'Heading 3'
            awaitingName = False
        elif len(content) <= longestHeading:
            awaitingName = True
            paragraph.style = 'Heading 2'
        # A long paragraph that merely mentions one of the words changes nothing.
        self.step += 1
# Removes the scanner error of mashing two separate lines of dialogue together into one
# paragraph.
def fixDoubleQuotes(self):
    """Split paragraphs in which two quoted lines were fused into one.

    For each paragraph the first closing-quote/space/opening-quote seam
    (curly quotes first, then straight quotes) is located with
    regexlib.match, whose result is used as an index with -1 meaning
    "not found". Text up to and including the closing quote becomes a new
    paragraph inserted before the current one; the rest stays in place.
    """
    if not self.gettingSize:
        self.progress = "Fixing double quotes..."
    self.psteps = 0.0
    for paragraph in self.document2.paragraphs:
        content = paragraph.text
        seam = regexlib.match(content, u'” “')
        if seam == -1:
            seam = regexlib.match(content, '" "')
        if seam != -1:
            # seam indexes the closing quote; the space after it is dropped.
            head, tail = content[:seam+1], content[seam+2:]
            paragraph.text = tail
            paragraph.insert_paragraph_before(head)
        self.step += 1
# Translates unicode punctuation to ASCII punctuation.
def convertPunctuation(self):
    """Replace curly quotes with ASCII quotes and drop other non-ASCII characters.

    Each run's text is translated and then encoded with
    ``encode('ascii', 'ignore')``, so any character outside ASCII that is
    not one of the four mapped quotes is discarded.
    """
    if not self.gettingSize:
        self.progress = "Converting punctuation..."
    # Curly single/double quotes -> apostrophe / straight double quote.
    asciiQuotes = { 0x2018:0x27, 0x2019:0x27, 0x201C:0x22, 0x201D:0x22 }
    for paragraph in self.document2.paragraphs:
        for run in paragraph.runs:
            asUnicode = u'%s' % run.text
            run.text = asUnicode.translate(asciiQuotes).encode('ascii', 'ignore')
            self.step += 1
# Replaces '---' with '—' and fixes em-dash spacing errors.
def fixEmDash(self):
    """Turn '---' back into an em-dash and strip spaces around em-dashes.

    Straight double quotes touching a '---' are first converted to the
    matching curly quote (closing after the dashes, opening before them).
    """
    if not self.gettingSize:
        self.progress = "Fixing em-dashes..."
    for paragraph in self.document2.paragraphs:
        for run in paragraph.runs:
            fixed = run.text
            fixed = fixed.replace('---"',u'---”')
            fixed = fixed.replace('"---',u'“---')
            fixed = fixed.replace("---",u"—")
            # Most specific spacing pattern first.
            for spaced in (" — ", " —", "— "):
                fixed = regexlib.replaceSub(fixed, spaced, "—")
            run.text = fixed
            self.step += 1
# Fixes some common quotation mark errors and changes ASCII quotation marks
# to the appropriate Unicode quotation marks.
def fixQuotations(self):
    """Convert straight double quotes to curly ones, run by run.

    The (pattern, replacement) rules are applied with regexlib.replaceSub
    in the listed order; the order matters because the catch-all rule near
    the end turns every remaining straight quote into an opening quote.
    """
    if not self.gettingSize:
        self.progress = "Fixing quotations..."
    rules = (
        (' "', u' “'),
        ('" ', u'” '),
        ('."', u'.”'),
        (',"', u',”'),
        ('!"', u'!”'),
        ('?"', u'?”'),
        ('..."', u'...”'),
        ('#—"#', u'#—“#'),
        ('#"—#', u'#”—#'),
        ('#—"', u'#—”'),
        ('"—#', u'“—#'),
        ('’"', u'’”'),
        ('\'"', u'’”'),
        ('";', u'”;'),
        ('":', u'”:'),
        ("' \"", u'’”'),
        ("’ \"", u'’”'),
        ('"', u'“'),
        ("”’", u'’”'),
    )
    for paragraph in self.document2.paragraphs:
        for run in paragraph.runs:
            fixed = run.text
            for pattern, replacement in rules:
                fixed = regexlib.replaceSub(fixed, pattern, replacement)
            run.text = fixed
            self.step += 1
# Removes the all-caps word(s) beginning each chapter, which are common to
# older stories
def fixCaps(self):
    """Lower-case the all-caps lead-in of 'Normal' paragraphs.

    Applies only to paragraphs of at least three characters whose first
    three characters are ASCII capitals. From the second character on,
    capitals are lower-cased until the first lower-case letter is reached.
    Paragraphs skipped by the length or style checks do not count a step.
    """
    if not self.gettingSize:
        self.progress = "Fixing chapter capitalization..."
    capitals = string.ascii_uppercase
    smalls = string.ascii_lowercase
    for paragraph in self.document2.paragraphs:
        if len(paragraph.text) < 3 or paragraph.style.name != "Normal":
            continue
        if all(ch in capitals for ch in paragraph.text[:3]):
            s = paragraph.text
            for i in range(1, len(s)):
                if s[i] in smalls:
                    break
                if s[i] in capitals:
                    s = regexlib.replaceIndex(s, i, s[i].lower())
            paragraph.text = s
        self.step += 1
def mergeParagraphs(self,p1,p2):
    """Append p2's runs to p1, delete p2, and return the surviving paragraph.

    If either argument is None the other one is returned untouched. A space
    is added to p1's last run when it has text that does not already end in
    one. The italic flag of each moved run is carried over.
    """
    if p1 is None:
        return p2
    if p2 is None:
        return p1
    existing = p1.runs
    tail = existing[-1] if existing else None
    if tail is not None and tail.text and not tail.text.endswith(" "):
        tail.text += " "
    for moved in p2.runs:
        copy = p1.add_run(moved.text)
        copy.font.italic = moved.font.italic
        moved.clear()
    self.deleteParagraph(p2)
    return p1
def deleteParagraph(self,paragraph):
    """Remove `paragraph` from its document and invalidate the proxy object.

    The underlying XML element is detached from its parent (if it still has
    one). The proxy's ``_p``/``_element`` references are then cleared so a
    stale paragraph object cannot silently keep editing a detached element.
    """
    element = paragraph._element
    parent = element.getparent()
    if parent is not None:
        parent.remove(element)
    # Clear the references on the proxy, not on the XML element itself.
    paragraph._p = paragraph._element = None
# Finds paragraphs beginning with a lowercase character and re-attaches them to the
# end of the preceding paragraph.
def fixCarriageReturn(self):
    """Re-join paragraphs that a stray line break split in two.

    `last` is either the previous paragraph, None, or one of the markers
    "CHAPTER"/"TITLE" (meaning the previous paragraph was a chapter heading
    or its title and must not be merged into). A paragraph is merged into
    `last` when it starts with a lower-case letter, or when the previous
    paragraph did not end in sentence punctuation (`merge` flag).

    Fixes relative to the earlier version:
    * the heading length limit now applies to all four heading words (an
      operator-precedence slip applied it to "Book" only), matching
      formatChapters();
    * a paragraph is never merged into itself, which used to delete a
      lower-case paragraph that followed a blank line or scene break;
    * the marker check uses isinstance instead of comparing
      ``str(type(last))`` with "str", which was always unequal.
    """
    if not self.gettingSize:
        self.progress = "Fixing carriage returns..."
    headingWords = ("CHAPTER", "Chapter", "BOOK", "Book")
    longestHeading = len("Chapter XXXXIIII")
    last = None
    merge = False
    for paragraph in self.document2.paragraphs:
        text = paragraph.text
        lastIsParagraph = last is not None and not isinstance(last, str)
        # Blank lines and scene breaks (and whatever follows them) restart
        # the merge state; the classification below still runs afterwards.
        if (text == "" or "*" in text
                or (lastIsParagraph and (last.text == "" or "*" in last.text))):
            last = paragraph
            merge = False
        if any(word in text for word in headingWords) and len(text) <= longestHeading:
            last = "CHAPTER"
            merge = False
        elif self.chapterNames and last == "CHAPTER":
            last = "TITLE"
            merge = False
        elif (len(text) >= 1 and text[0] in string.ascii_lowercase
                and last != "CHAPTER" and last != "TITLE"
                and last is not paragraph):
            last = self.mergeParagraphs(last, paragraph)
            merge = False
        elif merge and not isinstance(last, str):
            last = self.mergeParagraphs(last, paragraph)
            merge = False
        elif not (regexlib.endsWith(text,"[.!?-:]") or
                  regexlib.endsWith(text,"[.!?-:][\"']") or
                  regexlib.endsWith(text,"[.!?-:]'\"")):
            # No closing punctuation: the next paragraph continues this one.
            merge = True
            last = paragraph
        else:
            merge = False
            last = paragraph
        self.step += 1
# Fixes some common apostrophe errors and changes ASCII apostrophes to the
# appropriate Unicode apostrophe.
def fixApostrophes(self):
    """Convert straight apostrophes to curly ones, run by run.

    The (pattern, replacement) rules are applied with regexlib.replaceSub
    in the listed order. The first rule turns a doubled apostrophe into a
    straight double quote, and the last turns every remaining apostrophe
    into an opening single quote.
    """
    if not self.gettingSize:
        self.progress = "Fixing apostrophes..."
    rules = (
        ("''", "\""),
        (" '", u' ‘'),
        ("' ", u'’ '),
        (".'", u'.’'),
        (",'", u',’'),
        ("!'", u'!’'),
        ("?'", u'?’'),
        ("...'", u'...’'),
        ("'.", u'’.'),
        ("#—'#", u'#—‘#'),
        ("#'—#", u'#’—#'),
        ("#—'", u'#—’'),
        ("'—#", u'‘—#'),
        ("',", u'’,'),
        ("'!", u'’!'),
        ("'?", u'’?'),
        ("'...", u'’...'),
        ("#'#", u'#’#'),
        ("'", u'‘'),
    )
    for paragraph in self.document2.paragraphs:
        for run in paragraph.runs:
            fixed = run.text
            for pattern, replacement in rules:
                fixed = regexlib.replaceSub(fixed, pattern, replacement)
            run.text = fixed
            self.step += 1
# Removes double spaces and replaces them with single spaces.
def fixDoubleSpace(self):
    """Collapse every run of consecutive spaces in each run's text to one space.

    The earlier version searched for a single space and replaced it with a
    single space, which can never terminate on text containing a space.
    The pattern is now two spaces, replaced repeatedly until none remain.
    A run's text is only rewritten when it actually changes.
    """
    if not self.gettingSize:
        self.progress = "Fixing double spaces..."
    # Built with multiplication so the two-space literal survives any
    # whitespace normalisation of this source file.
    double = " " * 2
    for paragraph in self.document2.paragraphs:
        for run in paragraph.runs:
            text = run.text
            if double in text:
                while double in text:
                    text = text.replace(double, " ")
                run.text = text
            self.step += 1
# Changes common ellipsis misprints to proper formatting.
def fixEllipses(self):
    """Normalise spaced-out or over-long dot sequences to '...'.

    The (pattern, replacement) rules are applied with regexlib.replaceSub
    in the listed order, longest patterns first. The final two rules also
    attach a curly closing or opening double quote to a leftover two-dot
    fragment.
    """
    if not self.gettingSize:
        self.progress = "Fixing ellipses..."
    rules = (
        (" . . . . .", "..."),
        (" . . . .", "..."),
        (" . . .", "..."),
        (". . . . .", "..."),
        (". . . .", "..."),
        (". . .", "..."),
        (".....", "..."),
        ("....", "..."),
        (" . .", u'...”'),
        (". . ", u"“..."),
    )
    for paragraph in self.document2.paragraphs:
        for run in paragraph.runs:
            fixed = run.text
            for pattern, replacement in rules:
                fixed = regexlib.replaceSub(fixed, pattern, replacement)
            run.text = fixed
            self.step += 1
# Fixes hyphen spacing, removes unnecessary asterisks, and removes
# additional symbols.
def fixPunctuation(self):
    """Clean up hyphens, stray symbols, asterisks and spacing in every run.

    Steps, in order: hyphen spacing; '/' becomes 'I' and backslashes and
    carets are removed; asterisks are removed except in the '* * * *'
    scene-break marker; a period between a space and a character is
    dropped; double spaces are collapsed; the space between an opening
    double quote and an opening single quote is removed.

    The double-space step used to replace a single space with a single
    space, which did nothing. It now collapses two spaces into one, which
    also cleans up the gaps left by the asterisk removal above it.
    """
    if not self.gettingSize:
        self.progress = "Fixing punctuation..."
    # Built with multiplication so the two-space literal survives any
    # whitespace normalisation of this source file.
    double = " " * 2
    for paragraph in self.document2.paragraphs:
        for run in paragraph.runs:
            text = run.text
            # Hyphens
            # NOTE(review): " -" is removed outright (hyphen included) while
            # "- " keeps the hyphen -- confirm this asymmetry is intended.
            text = regexlib.removeSub(text," -")
            text = regexlib.replaceSub(text,"- ","-")
            # Misc
            text = regexlib.replaceSub(text,"/","I")
            text = regexlib.removeSub(text,"\\")
            text = regexlib.removeSub(text,"^")
            # Asterisks: park the scene-break marker under a placeholder so
            # the blanket removal does not destroy it, then restore it.
            text = regexlib.replaceSub(text,"* * * *", "& & & &")
            text = regexlib.removeSub(text,"*")
            text = regexlib.replaceSub(text,"& & & &", "* * * *")
            # Periods and spacing
            text = regexlib.replaceSub(text,"# .#","# #")
            text = regexlib.replaceSub(text, double, " ")
            text = regexlib.replaceSub(text,"“ ‘","“‘")
            run.text = text
            self.step += 1
# Replaces/fixes words commonly mistaken by the scanner for one another.
def fixWords(self):
    """Correct words the scanner commonly misreads.

    Each rule names the regexlib function to use (replaceWord or
    replaceSub), the misread text and its correction; rules are applied to
    every run in the listed order.
    """
    if not self.gettingSize:
        self.progress = "Fixing words..."
    word = regexlib.replaceWord
    sub = regexlib.replaceSub
    rules = (
        (word, "comer", "corner"),
        (word, "bom", "born"),
        (word, "modem", "modern"),
        (word, "tiling", "thing"),
        (word, "diat", "that"),
        (word, "sec", "see"),
        (word, "secs", "sees"),
        (word, "Fd", "I'd"),
        (word, "diem", "them"),
        (word, "Modem", "Modern"),
        (sub, "‘Tm", "“I’m"),
        (sub, "Td", "“I’d"),
        (word, "tire", "the"),
        (sub, "boy friend", "boyfriend"),
        (sub, "girl friend", "girlfriend"),
        (word, "Pie", "He"),
        (word, "Fie", "He"),
    )
    for paragraph in self.document2.paragraphs:
        for run in paragraph.runs:
            fixed = run.text
            for apply, wrong, right in rules:
                fixed = apply(fixed, wrong, right)
            run.text = fixed
            self.step += 1
# Retrieves the total number of steps to be taken within the document in order
# to properly set the loading bar.
def getSize(self):
    """Measure how many steps a full format()+fix() pass will take.

    A scratch document with the same paragraph/run layout as document1 (but
    empty text) temporarily replaces document2, and format() and fix() are
    run on it with self.gettingSize set. The resulting step count is stored
    in self.steps. The real document2, the gettingSize flag and the step
    counter are restored in a ``finally`` block, so a failure during the dry
    run cannot leave the formatter pointing at the scratch document;
    self.steps then keeps its initial estimate of 19 * numRuns.
    """
    self.progress = "Getting document size..."
    realDocument = self.document2
    # Rough estimate, replaced by the measured value when the dry run succeeds.
    self.steps = 19*self.numRuns
    try:
        self.document2 = Document()
        for paragraph in self.document1.paragraphs:
            skeleton = self.document2.add_paragraph("")
            for run in paragraph.runs:
                skeleton.add_run("")
        self.gettingSize = True
        self.format()
        self.fix()
        self.steps = self.step
    finally:
        self.gettingSize = False
        self.step = 0.0
        self.document2 = realDocument
# Spaces and justifies the "* * * *" scene break common to stories.
def formatSceneBreaks(self):
    """Surround each '* * * *' scene break with blank lines and centre it.

    For a paragraph whose text is exactly the marker, an empty 'Normal'
    paragraph and a centred copy of the marker are inserted before it, and
    the original paragraph is then emptied to serve as the trailing blank.
    """
    if not self.gettingSize:
        self.progress = "Separating scenes..."
    marker = "* * * *"
    for paragraph in self.document2.paragraphs:
        if paragraph.text == marker:
            paragraph.insert_paragraph_before("", "Normal")
            centred = paragraph.insert_paragraph_before(paragraph.text, "Normal")
            centred.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
            paragraph.text = ""
        self.step += 1
def fixPeriodSpacing(self):
    """Remove misplaced periods/spaces matched by three regexlib patterns.

    For each run, three passes are made. In every pass the pattern is
    searched repeatedly with regexlib.match (whose result is used as an
    index, -1 meaning "no match") and the character(s) directly after the
    matched position are cut out: one character for "#. <" and "# .<", two
    for "# . >". The run text is written back and a step is counted after
    each pass.
    """
    if not self.gettingSize:
        self.progress = "Fixing period spacing..."

    def cutAfterMatches(text, pattern, width):
        # Keep text[k], drop the next `width` characters, until no match is left.
        while True:
            k = regexlib.match(text, pattern)
            if k == -1:
                return text
            text = text[:k+1] + text[k+1+width:]

    passes = (("#. <", 1), ("# .<", 1), ("# . >", 2))
    for paragraph in self.document2.paragraphs:
        for run in paragraph.runs:
            text = run.text
            for pattern, width in passes:
                text = cutAfterMatches(text, pattern, width)
                run.text = text
                self.step += 1
def deleteRun(self,run):
    """Remove `run` from its paragraph and invalidate the proxy object.

    The underlying XML element is detached from its parent (if it still has
    one). The proxy's ``_r``/``_element`` references are then cleared so a
    stale run object cannot silently keep editing a detached element.
    """
    element = run._element
    parent = element.getparent()
    if parent is not None:
        parent.remove(element)
    # Clear the references on the proxy, not on the XML element itself.
    run._r = run._element = None
def compressRuns(self):
    """Merge neighbouring runs that share the same italic setting.

    Within each paragraph of document2, every run whose ``font.italic``
    equals that of the run kept before it is folded into that run and
    deleted. The earlier version advanced its index after every merge, so
    of three or more equal runs in a row only the first two were joined;
    here the kept run stays the comparison target until the formatting
    actually changes. The step counter is reset and then counts one step
    per paragraph.
    """
    self.progress = "Compressing runs..."
    self.step = 0.0
    for paragraph in self.document2.paragraphs:
        keeper = None
        # Snapshot the runs once; deletions below must not disturb iteration.
        for run in list(paragraph.runs):
            if keeper is not None and keeper.font.italic == run.font.italic:
                keeper.text = keeper.text + run.text
                run.clear()
                self.deleteRun(run)
            else:
                keeper = run
        self.step += 1
# Calls the other methods in their proper order.
def fix(self):
    """Run every correction pass, then the front-matter inserts, in order.

    fixDoubleQuotes appears twice on purpose, mirroring the original
    sequence: once after carriage returns are repaired and once more after
    the capitalisation pass.
    """
    if not self.gettingSize:
        self.progress = "Fixing mistakes..."
    pipeline = (
        "fixEmDash",
        "fixCarriageReturn",
        "fixDoubleQuotes",
        "fixApostrophes",
        "fixQuotations",
        "fixDoubleSpace",
        "fixEllipses",
        "fixPunctuation",
        "fixWords",
        "fixCaps",
        "fixDoubleQuotes",
        "formatSceneBreaks",
        "insertTitle",
        "insertAuthor",
        "insertCopyright",
        "insertPublisher",
        "fixPeriodSpacing",
    )
    for passName in pipeline:
        getattr(self, passName)()
# Calls the other methods in their proper order.
def format(self):
    """Run the structural passes: symbol removal, then chapter styling."""
    if not self.gettingSize:
        self.progress = "Formatting story..."
    for structuralPass in ("removeSymbols", "formatChapters"):
        getattr(self, structuralPass)()
# Opens the formatted docx after it's been saved.
def open(self):
    """Open the saved '_formatted.docx' file with the system's default application.

    The path is derived exactly as in save(). The earlier version passed
    "start <path>" to the shell, which breaks on paths containing spaces or
    quotes and only exists on Windows. The file is now opened without a
    shell: os.startfile on Windows, the `open` command on macOS, and
    `xdg-open` elsewhere. Failure to launch a viewer is reported through
    self.progress instead of aborting the run.
    """
    # Local import: only this method needs subprocess.
    import subprocess

    self.progress = "Opening file..."
    path = regexlib.clipRight(self.story.path, ".") + "_formatted.docx"
    try:
        if hasattr(os, "startfile"):
            # Windows only.
            os.startfile(path)
        elif sys.platform == "darwin":
            subprocess.call(["open", path])
        else:
            subprocess.call(["xdg-open", path])
    except OSError:
        # Opening the result is a convenience; the saved file is unaffected.
        self.progress = "Could not open formatted file."
# Sets the local result variable by processing the local story variable.
def run(self):
    """Drive one complete job: repair, build, measure, format, fix, save, open.

    The time spent in getSize(), format() and fix() is printed in whole
    seconds. On completion self.stage is set to "Complete" and both step
    counters are reset.
    """
    self.progress = "Running formatter..."
    unzip.fixXML(self.story.path)
    sys.stdout.flush()
    self.build()
    began = time.time()
    for stage in (self.getSize, self.format, self.fix):
        stage()
    # Truncated to whole seconds, then shown in float format.
    elapsed = float(int(time.time() - began))
    print("%f seconds" % elapsed)
    self.save()
    self.stage = "Complete"
    self.open()
    self.step = self.steps = 0.0